Compare commits

...

140 Commits

Author SHA1 Message Date
Tom
165bf5aca2 Tests passing checkpoint 2025-06-03 14:57:27 +02:00
Tom
aaafa28dfb A bit more on the rust backend 2025-05-29 17:09:17 +02:00
Tom Hodson
3328a0375b Fix update script a bit 2025-05-23 16:45:37 +00:00
Tom
ba2c67d812 Create example ingestion script 2025-05-23 10:55:32 +01:00
Tom
04b4ee24eb Silence protobuf warning 2025-05-22 17:26:58 +01:00
Tom
7069b70dd4 remove prints 2025-05-22 14:42:49 +01:00
Tom
90ea736c43 flesh out rust implementation 2025-05-22 14:40:44 +01:00
Tom
959dac332d Start writing rust backend 2025-05-19 10:20:12 +01:00
Tom
97c5abc38b Update image link 2025-05-14 10:33:38 +01:00
Tom
1188733034 Update re 2025-05-14 10:21:48 +01:00
Tom
35bb8f0edd Massive rewrite 2025-05-14 10:14:02 +01:00
Tom
ed4a9055fa fix bug add testcases 2025-05-12 14:40:16 +01:00
Tom
110046b251 progress on metadata 2025-05-09 17:25:00 +01:00
Tom
a85b700084 Merge branch 'main' into metadata 2025-05-07 15:47:40 +01:00
Tom
271d06c65a Update banner.svg 2025-05-06 15:16:34 +01:00
Tom
7c28c7023b Update README.md 2025-05-06 15:16:22 +01:00
Tom
4924fdb804 Add banner 2025-05-06 10:19:11 +01:00
Tom
d246dae54d Update climate_dt.json 2025-04-30 14:17:15 +02:00
Tom
07f9a24daa Add require_match argument to select 2025-04-30 14:05:42 +02:00
Tom
b13a06a0cc Update .gitignore 2025-04-30 14:05:42 +02:00
Tom
87c57ec2cc add selection test 2025-04-30 14:05:42 +02:00
Tom
a957d26da7 Update README.md 2025-04-30 11:06:48 +02:00
Tom
80b0408722 Update README ulrs 2025-04-24 10:59:42 +01:00
Tom
79e9f83c8c Add note about live server to docs 2025-04-24 10:54:11 +01:00
Tom
fff00ca6f1 Add test cads.json 2025-04-24 10:30:32 +01:00
Tom
fa646aee77 cosmetics 2025-04-24 10:28:52 +01:00
Tom
e04c0dd3bc Add tests 2025-04-23 14:40:09 +01:00
Tom
4e777f295d More work on metadata 2025-04-23 14:40:09 +01:00
Tom
1259ff08b6 first attempt 2025-04-23 14:38:33 +01:00
Tom
7b36a76154 Fix all of mypy's complaints. 2025-04-23 12:43:49 +01:00
Tom
10106ba6d8 Fix link templates 2025-04-17 09:31:51 +01:00
Tom Hodson
2fa99d775c fix mars language 2025-04-16 16:19:34 +00:00
Tom
b5c2681f63 Quick and dirty simple stac endpoint 2025-04-16 17:05:40 +01:00
Tom
dfc61caa38 Moves installation -> development 2025-04-16 15:20:38 +01:00
Tom
a502cb6ab2 Rejig quickstart a bit 2025-04-16 15:13:06 +01:00
Tom Hodson
ca944521f9 fix 2025-04-15 14:04:52 +00:00
Tom Hodson
6ec4b044b8 fix urls 2025-04-15 14:02:01 +00:00
Tom Hodson
251bec14fc update webapp 2025-04-15 13:50:03 +00:00
Tom
c3556ce6fa Fix example code url 2025-04-15 14:43:37 +01:00
Tom
bf47401e6c Add Qube.load 2025-04-12 16:06:29 +02:00
Tom
11014b07ea Update climate_dt.json 2025-04-03 14:03:44 +01:00
Tom
b6a27fdadf Update Qube.py 2025-04-03 14:02:50 +01:00
Tom
70b1fd65e5 add remove_by_key and improve compression 2025-04-02 17:51:20 +01:00
Tom
2e36db4268 Create od.json 2025-03-31 19:11:42 +01:00
Tom
6039a3a494 update cmd app 2025-03-31 19:11:15 +01:00
Tom
7ef930bc1c Update climate_dt.json 2025-03-31 18:45:40 +01:00
Tom
79983f85a1 clean up chart 2025-03-31 17:24:20 +01:00
Tom
ab2f8cf3f3 Integrate web_query_builder and stac server backend 2025-03-31 16:36:04 +01:00
Tom Hodson
4502a942cb integrate stac-server and web app 2025-03-31 12:48:13 +00:00
Tom Hodson
3017185950 stac_server 2025-03-31 12:40:23 +00:00
Tom
cd26905261 Update structured_stac.md 2025-03-31 10:37:56 +01:00
Tom
0d3c8248b0 update chart 2025-03-31 10:10:05 +01:00
Tom
b2aba5dd42 update build 2025-03-28 18:08:31 +00:00
Tom
79b97fd813 Update stac server and frontend 2025-03-28 17:50:29 +00:00
Tom
275831d186 Update climate dt schema 2025-03-28 16:32:46 +00:00
Tom
57877e1e0c Update extremes_dt.json 2025-03-28 09:52:29 +00:00
Tom
cf9db41dc4 add extremes_dt test data 2025-03-28 09:31:26 +00:00
Tom
8f1735c076 Update .gitignore 2025-03-28 09:29:13 +00:00
Tom
39f348244d Better error message for from_tree 2025-03-27 18:30:39 +00:00
Tom
2884f9fff8 update docs 2025-03-27 18:30:12 +00:00
Tom
df5360f29a Add convert_dtypes and selection with functions 2025-03-27 16:02:58 +00:00
Tom
d2f3165fe8 Remove print statement 2025-03-25 15:07:06 +00:00
Tom
6b98f7b7a9 Add creation from tree representation 2025-03-25 15:01:23 +00:00
Tom
9beaaa2e10 make consumption by selection off by default 2025-03-24 15:30:34 +00:00
Tom
06c84fb20e Fix selection bug to require that arguments be consumed by a branch 2025-03-24 15:28:06 +00:00
Tom
c31467fb04 Update fiab.md 2025-03-04 19:18:42 +01:00
Tom
6648502bf4 Add experimental wildcard value 2025-03-04 19:02:55 +01:00
Tom
e14b9ee12f add extra reqs 2025-03-03 15:50:28 +00:00
Tom
48444cc3ce Rename Values -> ValueGroup 2025-02-27 16:46:16 +00:00
Tom
8306fb4c3e Add cmd line app 2025-02-27 16:45:57 +00:00
Tom
68ad80e435 Add pre-commit hooks and run them 2025-02-26 09:11:30 +00:00
Tom
162dd48748 Unfreeze datastructures 2025-02-24 13:24:33 +00:00
Tom
ef844c9b57 Add alt-click copy of nodes paths, flesh out range types 2025-02-24 11:06:11 +00:00
Tom
1f7c5dfecd Remove notebooks 2025-02-21 10:50:14 +00:00
Tom
a23f366969 add docs requirements 2025-02-21 08:19:59 +00:00
Tom
ecccf336b4 Merge branch 'fiab' 2025-02-20 15:51:13 +00:00
Tom
4c941d34f8 Add fiab docs 2025-02-20 15:51:02 +00:00
Tom
a832e44e03 Add pypi version and wheel 2025-02-20 14:03:54 +00:00
Tom
11516a05ba Update example_products.md 2025-02-20 13:14:11 +00:00
Tom
8f5b202621 Update example_products.md 2025-02-20 13:14:11 +00:00
Tom
4a16d16748 Create example_products.md 2025-02-20 13:14:11 +00:00
Tom
3de40e46ef add example structure 2025-02-20 13:14:11 +00:00
Tom
8a2c5b341d sort pressure levels 2025-02-20 13:14:11 +00:00
Harrison Cook
819c29d768 Add frequency 2025-02-20 13:14:11 +00:00
Tom
ee546cd788 A version that works on windows 2025-02-19 17:22:53 +00:00
Tom
9873241eab Update update_version.sh 2025-02-19 17:12:23 +00:00
Tom
e432040321 swtich from cargo edit to sed 2025-02-19 17:00:57 +00:00
Tom
2d0c301062 Another try 2025-02-19 16:47:55 +00:00
Tom
52a82447f9 Another attempt 2025-02-19 16:14:24 +00:00
Tom
a70bd9f0cd Ok another try to get dynamic versions working 2025-02-19 16:04:22 +00:00
Tom
1ca23ca4cf Add a basic rust test in 2025-02-19 15:08:21 +00:00
Tom
bb61e6fe7c fix deps 2025-02-19 15:00:38 +00:00
Tom
73dd9a16a8 Make the rust backend non-optional 2025-02-19 14:57:11 +00:00
Tom
967adb1a69 Tidy up package and remove version from cargo.toml 2025-02-19 14:56:22 +00:00
Tom
4bcb09180e Add .datacubes() 2025-02-19 14:17:47 +00:00
Tom
ea07545dc0 fix set operations 2025-02-18 17:50:28 +00:00
Tom
9d4fcbe624 Set operations done 2025-02-18 07:15:22 +00:00
Tom
fe00bb1c7f Add span and axes 2025-02-14 15:59:32 +00:00
Tom
af69d2fe00 Start fleshing out set operations 2025-02-14 15:34:11 +00:00
Tom
62c7a49c59 Start filling out set operations 2025-02-14 09:13:55 +00:00
Tom
adeccec4e5 Add more docs 2025-02-14 09:13:55 +00:00
Tom
dca1e628df Update README.md 2025-02-13 15:30:15 +00:00
Tom
8fc87955d4 Add first test 2025-02-13 14:11:19 +00:00
Tom
ee1a5aa61b Update actions 2025-02-13 14:01:49 +00:00
Tom
0abbdfab15 Update .gitignore 2025-02-13 13:48:50 +00:00
Tom
2392531ec7 Update pyproject.toml 2025-02-13 13:48:34 +00:00
Tom
00ea804c35 Get rid of old code 2025-02-13 13:48:27 +00:00
Tom
fbf8a0fcaf Update docs 2025-02-13 13:48:02 +00:00
Tom
1ab51646b9 add command line entrypoint 2025-02-12 18:48:45 +00:00
Tom
516a4abb85 add a maturin CI runner 2025-02-12 18:27:23 +00:00
Tom
515e373c18 make maturin optional 2025-02-12 18:20:50 +00:00
Tom
ee1bee2a01 Add nicer formatting for html 2025-02-12 15:06:26 +00:00
Tom
37298b7096 move tests around 2025-02-12 13:43:38 +00:00
Tom
dc52408e35 Update __init__.py 2025-02-12 13:05:40 +00:00
Tom
db3c18b3c8 more 2025-02-12 13:00:15 +00:00
Tom
6277920ac4 comment out rsfdb for now 2025-02-12 12:57:24 +00:00
Tom
c537028bb7 make rust optional 2025-02-12 12:51:58 +00:00
Tom
be5a81c400 remove patches 2025-02-12 12:48:42 +00:00
Tom
084ad96569 Update .readthedocs.yaml 2025-02-12 12:47:16 +00:00
Tom
7bafcda627 reogranise 2025-02-12 12:44:30 +00:00
Tom
847bd0ab12 Add docs deps 2025-02-12 11:06:27 +00:00
Tom
bcd2d8feae ignore build 2025-02-12 11:05:53 +00:00
Tom
9928aacee2 add docs 2025-02-12 11:05:26 +00:00
Tom
b6c1f76698 Update background.md 2025-02-11 17:44:37 +00:00
Tom
e703b5f308 Update background.md 2025-02-11 17:43:36 +00:00
Tom
1dc9177a91 Updates 2025-02-11 17:39:48 +00:00
Tom
609e3e9f74 updates 2025-02-10 15:26:25 +00:00
Tom
81a478a58f add tree.json 2025-02-06 13:23:20 +00:00
Tom
f51f5dcb42 new code 2025-02-06 13:18:31 +00:00
Tom
fcdf4e0d51 Simplify webapp and handle both local dev and deployed URLs 2025-02-03 10:21:06 +00:00
James Hawkes
3dba4eaa5e Update structured_stac.md 2025-01-15 12:52:00 +00:00
James Hawkes
257380c46d Update README.md 2025-01-15 12:24:54 +00:00
Tom
f6088e9583 Update README.md 2025-01-15 12:24:05 +00:00
James Hawkes
deba35f71a Update README.md 2025-01-15 12:23:43 +00:00
James Hawkes
ca6b1fa8f9 Update README.md 2025-01-15 12:22:49 +00:00
James Hawkes
07b2c37aed Add license 2025-01-15 12:22:00 +00:00
Tom
9c4af79640 Update README.md 2025-01-15 12:21:53 +00:00
Tom Hodson
35b54c9f7e small changes 2024-12-13 14:48:33 +00:00
Tom Hodson
01729a323a Working climate-dt deployment 2024-12-13 14:48:33 +00:00
Tom
b679402a1b delete load data job yaml 2024-12-12 10:39:28 +00:00
127 changed files with 25444 additions and 5620 deletions

188
.github/workflows/build_wheels.yml vendored Normal file

@ -0,0 +1,188 @@
# This file is autogenerated by maturin v1.7.7
# To update, run
#
# maturin generate-ci github
#
name: Build Python Wheels and push to PyPI
on:
release:
types: [published]
permissions:
contents: read
jobs:
linux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-22.04
target: x86_64
- runner: ubuntu-22.04
target: x86
- runner: ubuntu-22.04
target: aarch64
- runner: ubuntu-22.04
target: armv7
- runner: ubuntu-22.04
target: s390x
- runner: ubuntu-22.04
target: ppc64le
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Set cargo version from tag
run: python .github/workflows/update_version.py
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-linux-${{ matrix.platform.target }}
path: dist
musllinux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-22.04
target: x86_64
- runner: ubuntu-22.04
target: x86
- runner: ubuntu-22.04
target: aarch64
- runner: ubuntu-22.04
target: armv7
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Set cargo version from tag
run: python .github/workflows/update_version.py
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
manylinux: musllinux_1_2
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-musllinux-${{ matrix.platform.target }}
path: dist
windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
architecture: ${{ matrix.platform.target }}
- name: Set cargo version from tag
run: python .github/workflows/update_version.py
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist
macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-13
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Set cargo version from tag
run: python .github/workflows/update_version.py
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist
sdist:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set cargo version from tag
run: python .github/workflows/update_version.py
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
args: --out dist
- name: Upload sdist
uses: actions/upload-artifact@v4
with:
name: wheels-sdist
path: dist
release:
name: Release
runs-on: ubuntu-latest
if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }}
needs: [linux, musllinux, windows, macos, sdist]
permissions:
# Use to sign the release artifacts
id-token: write
# Used to upload release artifacts
contents: write
# Used to generate artifact attestation
attestations: write
steps:
- uses: actions/download-artifact@v4
- name: Generate artifact attestation
uses: actions/attest-build-provenance@v1
with:
subject-path: 'wheels-*/*'
- name: Publish to PyPI
if: ${{ startsWith(github.ref, 'refs/tags/') }}
uses: PyO3/maturin-action@v1
env:
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
with:
command: upload
args: --non-interactive --skip-existing wheels-*/*

34
.github/workflows/test.yml vendored Normal file

@ -0,0 +1,34 @@
# This file is autogenerated by maturin v1.7.7
# To update, run
#
# maturin generate-ci github
#
name: Test
on:
push:
branches:
- main
- develop
pull_request:
workflow_dispatch:
permissions:
contents: read
jobs:
linux:
runs-on:
- ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build and Install
run: |
python -m pip install ".[dev]"
- name: Test
run: |
pytest

42
.github/workflows/update_version.py vendored Executable file

@ -0,0 +1,42 @@
import re
import subprocess
from pathlib import Path

CARGO_TOML_PATH = Path("Cargo.toml")


# Get the latest Git tag and strip the leading 'v' if present
def get_git_version():
    try:
        version = subprocess.check_output(
            ["git", "describe", "--tags", "--always"], text=True
        ).strip()
        version = re.sub(r"^v", "", version)  # Remove leading 'v'
        return version
    except subprocess.CalledProcessError:
        raise RuntimeError(
            "Failed to get Git tag. Make sure you have at least one tag in the repository."
        )


# Update version in Cargo.toml
def update_cargo_version(new_version):
    cargo_toml = CARGO_TOML_PATH.read_text()

    # Replace version in [package] section
    updated_toml = re.sub(
        r'^version = "[^"]+"',
        f'version = "{new_version}"',
        cargo_toml,
        flags=re.MULTILINE,
    )
    CARGO_TOML_PATH.write_text(updated_toml)


if __name__ == "__main__":
    version = get_git_version()
    print(f"Parsed version: {version}")
    update_cargo_version(version)
    print(f"Updated Cargo.toml with version: {version}")

18
.gitignore vendored

@ -4,4 +4,20 @@ config.yaml
.venv
*.json
raw_list
*.egg-info/
*.egg-info/
deps/
docs/_build/
docs/jupyter_execute
target/
*.so
_build/
build/
.ipynb_checkpoints/
dist/
Cargo.lock
src/python/qubed/_version.py
*.ipynb
cmake_build/
tests/data/
*.secret
node_modules/

17
.pre-commit-config.yaml Normal file

@ -0,0 +1,17 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
# - id: check-yaml
# - id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.7
hooks:
- id: ruff
args: [ --fix ]
- id: ruff-format

29
.readthedocs.yaml Normal file

@ -0,0 +1,29 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-20.04
tools:
python: "3.12"
rust: latest
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
# If using Sphinx, optionally build your docs in additional formats such as PDF
# formats:
# - pdf
python:
install:
- requirements: docs/requirements.txt
- method: pip
path: .
extra_requirements:
- docs

27
Cargo.toml Normal file

@ -0,0 +1,27 @@
[package]
name = "qubed"
version = "0.0.0"
edition = "2021"
repository = "https://github.com/ecmwf/qubed"
[dependencies]
# rsfdb = {git = "https://github.com/ecmwf/rsfdb", branch = "develop"}
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
pyo3 = "0.25"
lasso = "0.7.3"
itertools = "0.14.0"
[package.metadata.maturin]
version-from-git = true
[lib]
name = "tree_traverser"
crate-type = ["cdylib"]
path = "./src/rust/lib.rs"
# [patch.'https://github.com/ecmwf/rsfdb']
# rsfdb = { path = "../rsfdb" }
# [patch.'https://github.com/ecmwf-projects/rsfindlibs']
# rsfindlibs = { path = "../rsfindlibs" }

201
LICENSE.txt Normal file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2020 European Centre for Medium-Range Weather Forecasts
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -1,6 +1,41 @@
# Q<sup>3</sup> Quick Querying of Qubes
# <p align="center"><img src="https://raw.githubusercontent.com/ecmwf/qubed/refs/heads/main/docs/_static/banner.svg" width="1000"></p>
[![Static Badge](https://github.com/ecmwf/codex/raw/refs/heads/main/Project%20Maturity/emerging_badge.svg)](https://github.com/ecmwf/codex/raw/refs/heads/main/Project%20Maturity#emerging)
[![Docs](https://readthedocs.org/projects/qubed/badge/?version=latest)](https://qubed.readthedocs.io/en/latest/)
[![PyPi](https://img.shields.io/pypi/v/qubed.svg)](https://pypi.org/project/qubed/)
[![Wheel](https://img.shields.io/pypi/wheel/qubed.svg)](https://pypi.org/project/qubed/)
This repository contains a collection of components designed to deliver user-friendly cataloging for ECMWF's data. The STAC Server, Frontend and a periodic job to do tree compression can be deployed together to Kubernetes using the [helm chart](./helm_chart). This deployment can then be accessed either via the Query Builder web interface or the Python client.
Qubed provides a data structure primitive for working with trees of datacubes. If a normal tree looks like this:
```
root
├── class=od
│ ├── expver=0001
│ │ ├── param=1
│ │ └── param=2
│ └── expver=0002
│ ├── param=1
│ └── param=2
└── class=rd
├── expver=0001
│ ├── param=1
│ ├── param=2
│ └── param=3
└── expver=0002
├── param=1
└── param=2
```
A compressed view of the same set would be:
```
root
├── class=od, expver=0001/0002, param=1/2
└── class=rd
├── expver=0001, param=1/2/3
└── expver=0002, param=1/2
```
Qubed provides all the algorithms you would expect on this data structure, such as intersection/union/difference, compression, search, filtering, etc.
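For a concrete feel of the API, here is a minimal sketch of building and combining such trees from Python. The names below (`Qube.from_dict`, union via `|`, `.select`) are assumptions based on the roadmap and the commit history in this comparison, not a documented interface:
```python
from qubed import Qube

# Build two small datacube trees from nested dictionaries
# (assumed constructor; the roadmap lists "nested python dictionaries or JSON").
od = Qube.from_dict({"class=od": {"expver=0001/0002": {"param=1/2": {}}}})
rd = Qube.from_dict({
    "class=rd": {
        "expver=0001": {"param=1/2/3": {}},
        "expver=0002": {"param=1/2": {}},
    }
})

# Union is assumed to return a new, compressed Qube equivalent to the view above.
catalogue = od | rd
print(catalogue)

# Selection narrows the tree to matching branches (assumed signature).
print(catalogue.select({"expver": "0001"}))
```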
In addition to this core data structure, this repository contains a collection of components designed to deliver user-friendly cataloging for datacube data. The STAC Server, Frontend and a periodic job to do tree compression can be deployed together to Kubernetes using the [helm chart](./helm_chart). This deployment can then be accessed either via the Query Builder web interface or the Python client.
## 📦 Components Overview
@ -10,8 +45,8 @@ This repostitory contains a collection of components designed to deliver user fr
- 🌟 Implements our proposed [Datacube STAC Extension](./structured_stac.md).
- 🛠️ Allows efficient traversal of ECMWF's datacubes.
- Part of the implementation of this is [🌲 Tree Compressor](./tree_compresser), a **compressed tree representation** optimised for storing trees with many duplicated subtrees.
- 🔗 **[Live Example]()**.
- Part of the implementation of this is [🌲 Tree Compressor](./tree_compresser), a **compressed tree representation** optimised for storing trees with many duplicated subtrees.
- 🔗 **[Live Example](https://qubed.lumi.apps.dte.destination-earth.eu/api/v1/stac/climate-dt/?class=od%2Cd1&dataset=climate-dt)** (a request sketch follows below).
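As an illustration of how the STAC server is consumed, here is a minimal request sketch. The endpoint shape is taken from the Live Example link above; the query parameters are illustrative and the response is assumed to be a standard STAC catalogue document:
```python
import requests

# Endpoint taken from the Live Example link above; parameters are illustrative.
url = "https://qubed.lumi.apps.dte.destination-earth.eu/api/v1/stac/climate-dt/"
response = requests.get(url, params={"class": "od,d1", "dataset": "climate-dt"})
response.raise_for_status()

# Assumed to be a STAC catalogue; list the links it advertises.
catalogue = response.json()
for link in catalogue.get("links", []):
    print(link.get("rel"), link.get("href"))
```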
---
@ -19,11 +54,11 @@ This repostitory contains a collection of components designed to deliver user fr
> **Web Frontend**
- 👀 Displays data from the **STAC Server** in an intuitive user interface.
- 🌍 **[Try the Live Demo](http://catalogue.lumi.apps.dte.destination-earth.eu/)**.
- 🌍 **[Try the Live Demo](https://qubed.lumi.apps.dte.destination-earth.eu/)**.
---
### TODO: 🐍 [Qubed Python Query Builder](./python_query_builder)
### TODO: 🐍 [Qubed Python Query Builder](./python_query_builder)
> **Python Client**
- 🤖 A Python client for the **STAC Server**.

50
ROADMAP.md Normal file

@ -0,0 +1,50 @@
Initial Python Implementation
[x] Basic Qube datastructure
[x] Compression
[x] Set Operations (Union, Difference, Intersection...)
[x] Query with request
[x] Iteration over leaves
[x] Iteration over datacubes
[x] Command line creation from fdb list --compact
[ ] Set up periodic updates to climate-dt/extremes-dt again
[ ] Maybe also do production db?
[ ] Do mars list to constraints conversion
[ ] protobuf serialization
Rust port
[ ] Initial object
[ ] Sort out ownership issues, (one arena owned by python object)
[ ] Compression
[ ] Set Operations
[ ] Query with request
[ ] Iteration over leaves
[ ] Iteration over datacubes
[ ] Set up periodic updates to climate-dt/extremes-dt again
## API
Qubed will provide a core compressed tree data structure called a Qube with the following (a rough usage sketch follows these lists):
Methods to convert to and from:
- [x] A human readable representation like those seen above.
- [x] An HTML version where subtrees can be collapsed.
- [ ] A compact protobuf-based binary format
- [x] Nested python dictionaries or JSON
- [/] The output of [fdb list](https://confluence.ecmwf.int/display/FDB/fdb-list)
- [ ] [mars list][mars list]
- [ ] [constraints.json][constraints]
[constraints]: https://object-store.os-api.cci2.ecmwf.int/cci2-prod-catalogue/resources/reanalysis-era5-land/constraints_a0ae5b42d67869674e13fba9fd055640bcffc37c24578be1f465d7d5ab2c7ee5.json
[mars list]: https://git.ecmwf.int/projects/CDS/repos/cads-forms-reanalysis/browse/reanalysis-era5-single-levels/gecko-config/mars.list?at=refs%2Fheads%2Fprod
Useful algorithms:
- [x] Compression
- [/] Union/Intersection/Difference
Performant Membership Queries
- Identifier membership
- Datacube query (selection)
Metadata Storage
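As a rough illustration of the conversions listed above, here is a sketch of the intended round trip in Python. The method names (`from_dict`, `to_dict`, `leaves`, `datacubes`) are assumptions drawn from this roadmap and the commit messages, not a finalized API:
```python
from qubed import Qube

# Construct from a nested dictionary (assumed constructor).
q = Qube.from_dict({"class=od": {"expver=0001/0002": {"param=1/2": {}}}})

# Human-readable representation, as in the README examples.
print(q)

# Assumed dict/JSON round trip.
assert Qube.from_dict(q.to_dict()) == q

# Assumed iteration over leaves and datacubes (cf. the "Add .datacubes()" commit).
for leaf in q.leaves():
    print(leaf)
for datacube in q.datacubes():
    print(datacube)
```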


@ -1,6 +1,6 @@
apiVersion: v2
name: stac-server
description: A Helm chart for the STAC Server with frontend, STAC API and caching service.
name: qubed
description: A Helm chart for the STAC Server with frontend, STAC API and caching service.
type: application
version: 0.1.0
appVersion: "0.1.0"


@ -1,4 +1,4 @@
{{- if .Values.stacServer.ingress.enabled }}
{{- if .Values.ingress.enabled }}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
@ -6,9 +6,10 @@ metadata:
spec:
ingressClassName: nginx
rules:
- host: {{ .Values.stacServer.ingress.hostname }}
- host: {{ .Values.ingress.hostname }}
http:
paths:
{{- if .Values.stacServer.enabled }}
- path: /
pathType: Prefix
backend:
@ -16,12 +17,9 @@ spec:
name: stac-server
port:
number: {{ .Values.stacServer.servicePort }}
{{- end }}
tls:
- hosts:
- {{ .Values.stacServer.ingress.hostname }}
secretName: lumi-wildcard-tls
- {{ .Values.ingress.hostname }}
secretName: {{ .Values.ingress.tlsSecretName }}
{{- end }}


@ -1,28 +0,0 @@
# templates/redis-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: redis
spec:
selector:
matchLabels:
app: redis
template:
metadata:
labels:
app: redis
spec:
containers:
- name: redis
image: "redis:alpine"
command: ["redis-server", "--appendonly", "yes"]
ports:
- containerPort: {{ .Values.redis.servicePort }}
# volumeMounts:
# - mountPath: /data
# name: redis-data
# volumes:
# - name: redis-data
# persistentVolumeClaim:
# claimName: redis-data


@ -1,33 +0,0 @@
# apiVersion: batch/v1
# kind: Job
# metadata:
# name: redis-load-data
# annotations:
# "helm.sh/hook": post-install,post-upgrade
# "helm.sh/hook-weight": "1"
# "helm.sh/hook-delete-policy": before-hook-creation
# spec:
# template:
# spec:
# containers:
# - name: load-data
# image: redis:alpine
# command:
# - sh
# - -c
# - |
# # Wait for Redis to be ready
# until redis-cli -h redis ping | grep PONG; do
# echo "Waiting for Redis...";
# sleep 2;
# done;
# # Load data into Redis
# redis-cli -h redis set compressed_catalog "$(cat /data/compressed_tree.json)"
# volumeMounts:
# - name: redis-init-data
# mountPath: /data
# volumes:
# - name: redis-init-data
# configMap:
# name: redis-init-data
# restartPolicy: OnFailure


@ -1,14 +0,0 @@
# templates/redis-pvc.yaml
{{- if .Values.redis.pvc.enabled }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: redis-data
spec:
accessModes: {{ .Values.redis.pvc.accessModes }}
resources:
requests:
storage: {{ .Values.redis.pvc.size }}
storageClassName: {{ .Values.redis.pvc.storageClassName | quote }}
{{- end }}


@ -1,11 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: redis
spec:
selector:
app: redis
ports:
- protocol: TCP
port: {{ .Values.redis.servicePort }}
targetPort: {{ .Values.redis.servicePort }}


@ -5,7 +5,7 @@ kind: Deployment
metadata:
name: stac-server
spec:
replicas: 1 # Adjust as needed
replicas: {{ .Values.stacServer.replicas }}
selector:
matchLabels:
app: stac-server
@ -14,20 +14,30 @@ spec:
labels:
app: stac-server
spec:
initContainers:
- name: wait-for-redis
image: busybox
command:
[
'sh', '-c',
'until nc -z -v -w30 {{ .Values.stacServer.environment.REDIS_HOST }} {{ .Values.redis.service.port }}; do echo "Waiting for Redis..."; sleep 5; done;'
]
containers:
- name: stac-server
image: "{{ .Values.stacServer.image.repository }}:{{ .Values.stacServer.image.tag }}"
imagePullPolicy: {{ .Values.stacServer.image.pullPolicy }}
env:
- name: REDIS_HOST
value: "{{ .Values.stacServer.environment.REDIS_HOST }}"
- name: API_KEY
valueFrom:
secretKeyRef:
name: api-key
key: API_KEY
- name: API_URL
value: "https://{{ .Values.ingress.hostname }}/api/v1/"
ports:
- containerPort: {{ .Values.stacServer.servicePort }}
---
apiVersion: v1
kind: Service
metadata:
name: stac-server
spec:
selector:
app: stac-server
ports:
- protocol: TCP
port: {{ .Values.stacServer.servicePort }}
targetPort: {{ .Values.stacServer.servicePort }}
type: ClusterIP


@ -1,12 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: stac-server
spec:
selector:
app: stac-server
ports:
- protocol: TCP
port: {{ .Values.stacServer.servicePort }}
targetPort: {{ .Values.stacServer.servicePort }}
type: ClusterIP


@ -1,25 +1,13 @@
# values.yaml
redis:
servicePort: 6379
pvc:
enabled: true
storageClassName: ""
accessModes:
- ReadWriteOnce
size: 1Gi
service:
port: 6379
# See https://eccr.ecmwf.int/harbor/projects/258/repositories
stacServer:
enabled: true
replicas: 1
image:
repository: "eccr.ecmwf.int/qubed/stac_server"
tag: "latest"
pullPolicy: IfNotPresent
servicePort: 8080
environment:
REDIS_HOST: "redis"
ingress:
enabled: True
hostname: "climate-catalogue.lumi.apps.dte.destination-earth.eu"
pullPolicy: Always
servicePort: 80
ingress:
enabled: True
tlsSecretName: "lumi-wildcard-tls"
hostname: "qubed.lumi.apps.dte.destination-earth.eu"


@ -1,16 +1,5 @@
services:
# redis server holds the catalog data blob
redis:
image: redis:alpine
container_name: redis
command: ["redis-server", "--appendonly", "yes"]
volumes:
- ./redis-data:/data
ports:
- "6379:6379"
restart: always
# STAC Server
stac_server:
# image: stac-server:latest
@ -20,27 +9,24 @@ services:
dockerfile: Dockerfile
target: stac_server
ports:
- "8124:8080"
- "8124:80"
environment:
- REDIS_HOST=redis
- CONFIG_DIR=/config
volumes:
- ./stac_server:/code/stac_server
- ./TreeTraverser:/code/TreeTraverser
# restart: always
# web_query_builder:
# # image: web_query_builder:latest
# container_name: web_query_builder
# build:
# context: .
# dockerfile: Dockerfile
# target: web_query_builder
# ports:
# - "8125:8080"
# environment:
# - CONFIG_DIR=/config
# volumes:
# - ./web_query_builder:/code/web_query_builder
# restart: always
web_query_builder:
# image: web_query_builder:latest
container_name: web_query_builder
build:
context: .
dockerfile: Dockerfile
target: web_query_builder
ports:
- "8125:80"
environment:
- API_URL=http://127.0.0.1:8124/api/v1/stac/climate-dt
volumes:
- ./web_query_builder:/code/web_query_builder
restart: always


@ -0,0 +1,6 @@
---
type: remote
host: databridge-prod-catalogue3-ope.ewctest.link
port: 10000
engine: remote
store: remote


@ -0,0 +1,6 @@
---
type: remote
host: databridge-prod-catalogue1-ope.ewctest.link
port: 10000
engine: remote
store: remote

File diff suppressed because it is too large


@ -140,7 +140,7 @@ _field: &_field
- [lwda, long window daily archive] # extremes-dt
- [lwwv, long window wave] # extremes-dt
- [clmn, climate-monthly, Climate run monthly means output] # climate-dt
# - [amap, analysis for multianalysis project]
# - [ammc, melbourne]
# - [cher, ch, chernobyl]
@ -468,7 +468,7 @@ _field: &_field
type: enum
multiple: true
values:
- [20211021, ]
- [20211021, ]
year:
category: data

18996
config/language/paramids.yaml Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,590 +0,0 @@
# * Format of the rules is:
# [a1, a2, a3 ...[b1, b2, b3... [c1, c2, c3...]]]
# - The first level (a) defines which attributes are used to name the top level directory
# - The second level (b) defines which attributes are used to name the data files
# - The third level (c) defines which attributes are used as index keys
# * Rules can be grouped
# [a1, a2, a3 ...
# [b1, b2, b3... [c1, c2, c3...]]
# [B1, B2, B3... [C1, C2, C3...]]
# ]
# * A list of values can be given for an attribute
# [ ..., stream=enfo/efov, ... ]
# This will be used when matching rules.
# * Attributes can be typed
# Globally, at the beginning of this file:
# refdate: Date;
# or in the context of a rule:
# [type=cl, ... [date:ClimateMonth, ...]]
# Typing attributes is done when the user's requests or the GRIB values need to be modified before directories, files and indexes are created. For example, ClimateMonth will transform 2010-04-01 to 'may' internally.
# * Attributes can be optional
# [ step, levelist?, param ]
# They will be replaced internally by an empty value. It is also possible to provide a default substitution value: e.g. [domain?g] will consider the domain to be 'g' if missing.
# * Attributes can be removed:
# [grid-]
# This is useful to remove attributes present in the GRIB that should not be ignored
# * Rules are matched:
# - If the attributes are present in the GRIB/Request, or marked optional or ignored
# - If a list of possible value is provided, one of them must match, for example
# [ class, expver, stream=enfo/efov, date, time, domain ]
# will match either stream=enfo or stream=efov, all other attributes will be matched if they exist in the GRIB or user's request
# * On archive:
# - Attributes are extracted from the GRIB (namespace 'mars'), possibly modified by the attribute type
# - Only the first rule is used, so order is important
# - All GRIB attributes must be used by the rules, otherwise an error is raised
# * On retrieve:
# - Attributes are extracted from the user's request, possibly modified by the attribute type (e.g. for handling of U/V)
# - All the matching rules are considered
# - Only attributes listed in the rules are used to extract values from the user's request
# Default types
param: Param;
step: Step;
date: Date;
hdate: Date;
refdate: Date;
latitude: Double;
longitude: Double;
levelist: Double;
grid: Grid;
expver: Expver;
time: Time;
fcmonth: Integer;
number: Integer;
frequency: Integer;
direction: Integer;
channel: Integer;
instrument: Integer;
ident: Integer;
diagnostic: Integer;
iteration: Integer;
system: Integer;
method: Integer;
# ???????
# reference: Integer;
# fcperiod: Integer;
# opttime: Integer;
# leadtime: Integer;
# quantile: ??????
# range: ??????
# band: Integer;
########################################################
# These rules must be first, otherwise fields of these
# classes will be indexed with the default rule for oper
[ class=ti/s2, expver, stream, date, time, model
[ origin, type, levtype, hdate?
[ step, number?, levelist?, param ]]
]
[ class=ms, expver, stream, date, time, country=de
[ domain, type, levtype, dbase, rki, rty, ty
[ step, levelist?, param ]]
]
[ class=ms, expver, stream, date, time, country=it
[ domain, type, levtype, model, bcmodel, icmodel:First3
[ step, levelist?, param ]
]
]
[ class=el, expver, stream, date, time, domain
[ origin, type, levtype
[ step, levelist?, param ]]
]
########################################################
# These are the rules matching most of the fields
# oper/dcda
[ class, expver, stream=oper/dcda/scda, date, time, domain?
[ type=im/sim
[ step?, ident, instrument, channel ]]
[ type=ssd
[ step, param, ident, instrument, channel ]]
[ type=4i, levtype
[ step, iteration, levelist, param ]]
[ type=me, levtype
[ step, number, levelist?, param ]]
[ type=ef, levtype
[ step, levelist?, param, channel? ]]
[ type=ofb/mfb
[ obsgroup, reportype ]]
[ type, levtype
[ step, levelist?, param ]]
]
# dcwv/scwv/wave
[ class, expver, stream=dcwv/scwv/wave, date, time, domain
[ type, levtype
[ step, param, frequency?, direction? ]]]
# enfo
[ class, expver, stream=enfo/efov, date, time, domain
[ type, levtype=dp, product?, section?
[ step, number?, levelist?, latitude?, longitude?, range?, param ]]
[ type=tu, levtype, reference
[ step, number, levelist?, param ]]
[ type, levtype
[ step, quantile?, number?, levelist?, param ]]
]
# waef/weov
[ class, expver, stream=waef/weov, date, time, domain
[ type, levtype
[ step, number?, param, frequency?, direction? ]]
]
########################################################
# enda
[ class, expver, stream=enda, date, time, domain
[ type=ef/em/es/ses, levtype
[ step, number?, levelist?, param, channel? ]]
[ type=ssd
[ step, number, param, ident, instrument, channel ]]
[ type, levtype
[ step, number?, levelist?, param ]]
]
# ewda
[ class, expver, stream=ewda, date, time, domain
[ type, levtype
[ step, number?, param, frequency?, direction? ]]
]
########################################################
# elda
[ class, expver, stream=elda, date, time, domain?
[ type=ofb/mfb
[ obsgroup, reportype ]]
[ type, levtype, anoffset
[ step, number?, levelist?, iteration?, param, channel? ]]
]
# ewda
[ class, expver, stream=ewla, date, time, domain
[ type, levtype, anoffset
[ step, number?, param, frequency?, direction? ]]
]
########################################################
# elda
[ class, expver, stream=lwda, date, time, domain?
[ type=ssd, anoffset
[ step, param, ident, instrument, channel ]]
[type=me, levtype, anoffset
[ number, step, levelist?, param]]
[ type=4i, levtype, anoffset
[ step, iteration, levelist, param ]]
[ type=ofb/mfb
[ obsgroup, reportype ]]
[ type, levtype, anoffset
[ step, levelist?, param]]
]
# ewda
[ class, expver, stream=lwwv, date, time, domain
[ type, levtype, anoffset
[ step, param, frequency?, direction? ]]
]
########################################################
# amap
[ class, expver, stream=amap, date, time, domain
[ type, levtype, origin
[ step, levelist?, param ]]]
# maed
[ class, expver, stream=maed, date, time, domain
[ type, levtype, origin
[ step, levelist?, param ]]]
# mawv
[ class, expver, stream=mawv, date, time, domain
[ type, levtype, origin
[ step, param, frequency?, direction? ]]]
# cher
[ class, expver, stream=cher, date, time, domain
[ type, levtype
[ step, levelist, param ]]]
# efhc
[ class, expver, stream=efhc, refdate, time, domain
[ type, levtype, date
[ step, number?, levelist?, param ]]]
# efho
[ class, expver, stream=efho, date, time, domain
[ type, levtype, hdate
[ step, number?, levelist?, param ]]]
# efhs
[ class, expver, stream=efhs, date, time, domain
[ type, levtype
[ step, quantile?, number?, levelist?, param ]]]
# wehs
[ class, expver, stream=wehs, date, time, domain
[ type, levtype
[ step, quantile?, number?, levelist?, param ]]]
# kwbc
[ class, expver, stream=kwbc, date, time, domain
[ type, levtype
[ step, number?, levelist?, param ]]]
# ehmm
[ class, expver, stream=ehmm, date, time, domain
[ type, levtype, hdate
[ fcmonth, levelist?, param ]]]
# ammc/cwao/edzw/egrr/lfpw/rjtd/toga
[ class, expver, stream=ammc/cwao/edzw/egrr/lfpw/rjtd/toga/fgge, date, time, domain
[ type, levtype
[ step, levelist?, param ]]]
########################################################################
# enfh
[ class, expver, stream=enfh, date, time, domain
[ type, levtype=dp, hdate, product?, section?
[ step, number?, levelist?, latitude?, longitude?, range?, param ]]
[ type, levtype, hdate
[ step, number?, levelist?, param ]]
]
# enwh
[ class, expver, stream=enwh, date, time, domain
[ type, levtype, hdate
[ step, number?, param, frequency?, direction? ]]
]
########################################################################
# sens
[ class, expver, stream=sens, date, time, domain
[ type, levtype
[ step, diagnostic, iteration, levelist?, param ]]]
########################################################################
# esmm
[ class, expver, stream=esmm, date, time, domain
[ type, levtype
[ fcmonth, levelist?, param ]]]
# ewhc
[ class, expver, stream=ewhc, refdate, time, domain
[ type, levtype, date
[ step, number?, param, frequency?, direction? ]]]
########################################################################
# ewho
[ class, expver, stream=ewho, date, time, domain
[ type, levtype, hdate
[ step, number?, param, frequency?, direction? ]]]
# mfam
[ class, expver, stream=mfam, date, time, domain
[ type=pb/pd, levtype, origin, system?, method
[ fcperiod, quantile, levelist?, param ]]
[ type, levtype, origin, system?, method
[ fcperiod, number?, levelist?, param ]]
]
# mfhm
[ class, expver, stream=mfhm, refdate, time, domain
[ type, levtype, origin, system?, method, date?
[ fcperiod, number?, levelist?, param ]]]
# mfhw
[ class, expver, stream=mfhw, refdate, time, domain
[ type, levtype, origin, system?, method, date
[ step, number?, param ]]]
# mfwm
[ class, expver, stream=mfwm, date, time, domain
[ type, levtype, origin, system?, method
[ fcperiod, number, param ]]]
# mhwm
[ class, expver, stream=mhwm, refdate, time, domain
[ type, levtype, origin, system?, method, date
[ fcperiod, number, param ]]]
# mmsf
[ class, expver, stream=mmsf, date, time, domain
[ type, levtype=dp, origin, product, section, system?, method
[ step, number, levelist?, latitude?, longitude?, range?, param ]]
[ type, levtype, origin, system?, method
[ step, number, levelist?, param ]]
]
# mnfc
[ class, expver, stream=mnfc, date, time, domain
[ type, levtype=dp, origin, product, section, system?, method
[ step, number?, levelist?, latitude?, longitude?, range?, param ]]
[ type, levtype, origin, system?, method
[ step, number?, levelist?, param ]]
]
# mnfh
[ class, expver, stream=mnfh, refdate, time, domain
[ type, levtype=dp, origin, product, section, system?, method, date
[ step, number?, levelist?, latitude?, longitude?, range?, param ]]
[ type, levtype, origin, system?, method, date?
[ step, number?, levelist?, param ]]
]
# mnfm
[ class, expver, stream=mnfm, date, time, domain
[ type, levtype, origin, system?, method
[ fcperiod, number?, levelist?, param ]]]
# mnfw
[ class, expver, stream=mnfw, date, time, domain
[ type, levtype, origin, system?, method
[ step, number?, param ]]]
# ea/mnth
[ class=ea, expver, stream=mnth, date, domain
[ type, levtype
[ time, step?, levelist?, param ]]]
# mnth
[ class, expver, stream=mnth, domain
[ type=cl, levtype
[ date: ClimateMonthly, time, levelist?, param ]]
[ type, levtype
[ date , time, step?, levelist?, param ]]]
# mofc
[ class, expver, stream=mofc, date, time, domain
[ type, levtype=dp, product, section, system?, method
[ step, number?, levelist?, latitude?, longitude?, range?, param ]]
[ type, levtype, system?, method
[ step, number?, levelist?, param ]]
]
# mofm
[ class, expver, stream=mofm, date, time, domain
[ type, levtype, system?, method
[ fcperiod, number, levelist?, param ]]]
# mmsa/msmm
[ class, expver, stream=mmsa, date, time, domain
[ type, levtype, origin, system?, method
[ fcmonth, number?, levelist?, param ]]]
[ class, expver, stream=msmm, date, time, domain
[ type, levtype, origin, system?, method
[ fcmonth, number?, levelist?, param ]]]
# ocea
[ class, expver, stream=ocea, date, time, domain
[ type, levtype, product, section, system?, method
[ step, number, levelist?, latitude?, longitude?, range?, param ]]
]
#=# seas
[ class, expver, stream=seas, date, time, domain
[ type, levtype=dp, product, section, system?, method
[ step, number, levelist?, latitude?, longitude?, range?, param ]]
[ type, levtype, system?, method
[ step, number, levelist?, param ]]
]
# sfmm/smma
[ class, expver, stream=sfmm/smma, date, time, domain
[ type, levtype, system?, method
[ fcmonth, number?, levelist?, param ]]]
# supd
[ class=od, expver, stream=supd, date, time, domain
[ type, levtype, origin?, grid
[ step, levelist?, param ]]]
# For era
[ class, expver, stream=supd, date, time, domain
[ type, levtype, grid- # The minus sign is here to consume 'grid', but don't index it
[ step, levelist?, param ]]]
# swmm
[ class, expver, stream=swmm, date, time, domain
[ type, levtype, system?, method
[ fcmonth, number, param ]]]
# wamf
[ class, expver, stream=wamf, date, time, domain
[ type, levtype, system?, method
[ step, number?, param ]]]
# ea/wamo
[ class=ea, expver, stream=wamo, date, domain
[ type, levtype
[ time, step?, param ]]]
# wamo
[ class, expver, stream=wamo, domain
[ type=cl, levtype
[ date: ClimateMonthly, time, param ]]
[ type, levtype
[ date, time, step?, param ]]]
# wamd
[ class, expver, stream=wamd, date, domain
[ type, levtype
[ param ]]]
# wasf
[ class, expver, stream=wasf, date, time, domain
[ type, levtype, system?, method
[ step, number, param ]]]
# wmfm
[ class, expver, stream=wmfm, date, time, domain
[ type, levtype, system?, method
[ fcperiod, number, param ]]]
# moda
[ class, expver, stream=moda, date, domain
[ type, levtype
[ levelist?, param ]]]
# msdc/mdfa/msda
[ class, expver, stream=msdc/mdfa/msda, domain
[ type, levtype
[ date, time?, step?, levelist?, param ]]]
# seap
[ class, expver, stream=seap, date, time, domain
[ type=sv/svar, levtype, origin, method?
[ step, leadtime, opttime, number, levelist?, param ]]
[ type=ef, levtype, origin
[ step, levelist?, param, channel? ]]
[ type, levtype, origin
[ step, levelist?, param ]]
]
[ class, expver, stream=mmaf, date, time, domain
[ type, levtype, origin, system?, method
[ step, number, levelist?, param ]]
]
[ class, expver, stream=mmam, date, time, domain
[ type, levtype, origin, system?, method
[ fcmonth, number, levelist?, param ]]
]
[ class, expver, stream=dacl, domain
[ type=pb, levtype
[ date: ClimateDaily, time, step, quantile, levelist?, param ]]
[ type, levtype
[ date: ClimateDaily, time, step, levelist?, param ]]
]
[ class, expver, stream=dacw, domain
[ type=pb, levtype
[ date: ClimateDaily, time, step, quantile, param ]]
[ type, levtype
[ date: ClimateDaily, time, step, param ]]
]
[ class, expver, stream=edmm/ewmm, date, time, domain
[ type=ssd
[ step, number, param, ident, instrument, channel ]]
[ type, levtype
[ step, number, levelist?, param ]]
]
[ class, expver, stream=edmo/ewmo, date, domain
[ type, levtype
[ number, levelist?, param ]]
]
# stream gfas
[ class=mc/rd, expver, stream=gfas, date, time, domain
[ type=ga, levtype
[ step, param ]]
[ type=gsd
[ param, ident, instrument ]]
]
# class is e2
[ class, expver, stream=espd, date, time, domain
[ type, levtype, origin, grid
[ step, number, levelist?, param ]]]
[ class=cs, expver, stream, date:Default, time, domain
[ type, levtype
[ step, levelist?, param ]]]


@ -1,11 +0,0 @@
[ class=od, stream, date, time
[ domain, type, levtype, dbase, rki, rty, ty
[ step, levelist?, param ]]
]
[ class=ensemble, number, stream, date, time,
[ domain, type, levtype, dbase, rki, rty, ty
[ step, levelist?, param ]]
]
[ class, foo]


@ -1,12 +1,15 @@
FROM python:3.12-slim AS stac_server
FROM python:3.12-slim AS base
RUN apt-get update && apt-get install -y \
build-essential \
curl \
openssh-client \
openssh-client \
git \
&& apt-get clean
RUN pip install uv
# Allows cloning private repos using RUN --mount=type=ssh git clone
RUN mkdir -p -m 0600 ~/.ssh && \
ssh-keyscan -H github.com >> ~/.ssh/known_hosts
@ -15,20 +18,19 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /code
FROM base AS stac_server
COPY stac_server/requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
# TODO: don't embed these here, mount them at runtime
COPY config/destinE/schema /config/schema
COPY config/destinE/language.yaml /config/language.yaml
COPY ./src /code/qubed/src
COPY ./pyproject.toml /code/qubed/
COPY ./Cargo.toml /code/qubed/
COPY ./README.md /code/qubed/
COPY ./tree_compresser /code/tree_compresser
# Clone the rsfdb and rsfindlibs repos manually because they're private
RUN --mount=type=ssh git clone ssh://git@github.com/ecmwf/rsfdb.git
RUN --mount=type=ssh git clone ssh://git@github.com/ecmwf/rsfindlibs.git
RUN pip install --no-cache-dir -e /code/tree_compresser
RUN pip install --no-cache-dir -e /code/qubed
COPY ./stac_server /code/stac_server
WORKDIR /code/stac_server
CMD ["fastapi", "dev", "main.py", "--proxy-headers", "--port", "8080", "--host", "0.0.0.0"]
CMD ["fastapi", "dev", "main.py", "--proxy-headers", "--port", "80", "--host", "0.0.0.0"]

20
docs/Makefile Normal file

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

159
docs/_static/banner.svg vendored Normal file

@ -0,0 +1,159 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="1000"
height="200"
viewBox="0 0 264.58333 52.916666"
version="1.1"
id="svg5"
xml:space="preserve"
inkscape:version="1.2.2 (b0a84865, 2022-12-01)"
sodipodi:docname="banner.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg"><sodipodi:namedview
id="namedview7"
pagecolor="#ffffff"
bordercolor="#000000"
borderopacity="0.25"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="1.4221154"
inkscape:cx="509.80392"
inkscape:cy="23.908046"
inkscape:window-width="2665"
inkscape:window-height="1000"
inkscape:window-x="96"
inkscape:window-y="35"
inkscape:window-maximized="0"
inkscape:current-layer="g330" /><defs
id="defs2"><rect
x="641.41612"
y="32.816639"
width="73.588826"
height="29.833308"
id="rect2775" /><rect
x="500.20513"
y="263.52755"
width="244.63313"
height="143.19988"
id="rect2749" /><rect
x="467.38849"
y="331.14972"
width="258.55534"
height="132.261"
id="rect2743" /><rect
x="80.859469"
y="61.833711"
width="299.65568"
height="114.15454"
id="rect242" /><rect
x="61.833711"
y="66.590151"
width="313.925"
height="114.15454"
id="rect236" /></defs><g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text234"
style="font-weight:500;font-size:20px;line-height:1.2;font-family:Futura;-inkscape-font-specification:'Futura, Medium';white-space:pre;shape-inside:url(#rect236);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text240"
style="font-weight:500;font-size:20px;line-height:1.2;font-family:Futura;-inkscape-font-specification:'Futura, Medium';white-space:pre;shape-inside:url(#rect242);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text2741"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:20px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;shape-inside:url(#rect2743);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text2747"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:20px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;shape-inside:url(#rect2749);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text2773"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:26.6667px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;shape-inside:url(#rect2775);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><g
id="g349"
transform="translate(-5.8208336)"><text
xml:space="preserve"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="114.48351"
y="14.228302"
id="text2763"><tspan
sodipodi:role="line"
id="tspan2761"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="14.228302">root</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="21.701376"
id="tspan2765">├── class=cd, stream=lwda/oai, param=1/2/3</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="29.17445"
id="tspan2767">├── class=od, expver=1/2, param=1/2</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="36.647522"
id="tspan2771">├── class=rd, param=1/2/3</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="44.120598"
id="tspan2769">└── ...</tspan></text><g
id="g330"><text
xml:space="preserve"
style="font-weight:500;font-size:14.1111px;line-height:0;font-family:Futura;-inkscape-font-specification:'Futura, Medium';white-space:pre;inline-size:112.115;display:inline;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="5.439929"
y="17.022402"
id="text248"
transform="translate(0,-1.5875)"><tspan
x="5.439929"
y="17.022402"
id="tspan532"><tspan
style="font-size:12.3472px;line-height:1.2"
id="tspan530">Qube</tspan></tspan></text><text
xml:space="preserve"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:5.29167px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;inline-size:87.6248;display:inline;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="4.209815"
y="40.519432"
id="text2755"
transform="translate(1.744648,-4.9844494)"><tspan
x="4.209815"
y="40.519432"
id="tspan534">1. A data structure for efficiently </tspan><tspan
x="4.209815"
y="46.604852"
id="tspan536">representing and querying complex </tspan><tspan
x="4.209815"
y="52.690271"
id="tspan538">tree-like datacubes.</tspan></text><text
xml:space="preserve"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:5.29167px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="5.4673572"
y="26.586193"
id="text2759"><tspan
sodipodi:role="line"
id="tspan2757"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="5.4673572"
y="26.586193">[kjuːb] <tspan
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
id="tspan495">noun</tspan></tspan></text><path
style="fill:#000000;stroke:#000000;stroke-width:0.445;stroke-miterlimit:4.9;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
d="M 6.0516036,18.417924 H 92.221177"
id="path2833"
sodipodi:nodetypes="cc" /></g></g></g></svg>


86
docs/algorithms.md Normal file

@ -0,0 +1,86 @@
---
jupytext:
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
---
# Under the Hood
## Set Operations
Qubes represent sets of objects, so the familiar set operations:
* Union `A | B` or `Qube.union(A, B)`
* Intersection `A & B` or `Qube.intersection(A, B)`
* Difference (both `A - B` or `B - A`) or `Qube.difference(A, B)`
* Symmetric difference `A ^ B` or `Qube.symmetric_difference(A, B)`
are all defined.
We can implement these operations by breaking the problem down into a recursive function:
```python
def operation(A : Qube, B : Qube) -> Qube:
...
```
Consider the intersection of A and B:
```
A
├─── a=1, b=1/2/3, c=1
└─── a=2, b=1/2/3, c=1
B
├─── a=1, b=3/4/5, c=2
└─── a=2, b=3/4/5, c=2
```
We pair the two trees and traverse them in tandem. At each level we group the nodes by key and, for every pair of nodes in a group, compute the values only in A, the values only in B and the values in both:
```
for node_a in level_A:
for node_b in level_B:
just_A, intersection, just_B = Qube.fused_set_operations(
node_a.values,
node_b.values
)
```
Based on the particular operation we're computing we keep or discard these three objects:
* Union: keep just_A, intersection, just_B
* Intersection: keep intersection
* Difference: for A - B keep just_A, for B - A keep just_B
* Symmetric difference: keep just_A and just_B but not intersection
The reason we have to keep just_A, intersection and just_B separate is that each will produce a node with different children:
* just_B: the children of node_B
* just_A: the children of node_A
* intersection: the result of calling `operation(A, B)` recursively on two new nodes formed from A and B but with just the intersecting values.
This structure means that node.values can take different types, the two most useful being:
* an enum, just a set of values
* a range with start, stop and step
Qube.fused_set_operations can dispatch on the two types given in order to efficiently compute set/set, set/range and range/range intersection operations.
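As a rough illustration, here is a minimal sketch of the enum/enum case of `fused_set_operations` and of the keep/discard table above. It assumes enum values behave like plain Python sets; the names and table layout are illustrative, not the exact implementation:
```python
def fused_set_operations(a_values: set, b_values: set):
    # For two enum value groups the three pieces are ordinary set operations.
    just_a = a_values - b_values
    intersection = a_values & b_values
    just_b = b_values - a_values
    return just_a, intersection, just_b

# Which of the three pieces each operation keeps: (just_A, intersection, just_B)
KEEP = {
    "union": (True, True, True),
    "intersection": (False, True, False),
    "difference": (True, False, False),  # A - B; swap just_A/just_B for B - A
    "symmetric_difference": (True, False, True),
}
```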
### Performance considerations
This algorithm is quadratic in the number of nodes with matching keys: if we have a level with a huge number of nodes with key 'date' and range values (range types are currently restricted to being contiguous) we could end up with a quadratic slowdown.
There are some ways this can be sped up:
* Once we know any of just_A, intersection or just_B are empty we can discard them. Only for quite pathological inputs (many sparse enums with a lot of overlap) would you actually get quadratically many non-empty terms.
* For ranges intersected with ranges, we could speed the algorithm up significantly by sorting the ranges and walking the two lists in tandem, which reduces it to linear in the number of ranges (see the sketch after this list).
* If we have N_A and N_B nodes to compare between the two trees we have N_A*N_B comparisons to do. However, at the end of the day we're just trying to determine, for each value, whether it's in A, B or both, so if N_A*N_B >> M, the total number of values, we might be able to switch to an alternative algorithm.
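For example, a minimal sketch of the sorted-range idea from the list above, assuming ranges are represented as half-open `(start, stop)` tuples (this is a sketch, not the library's internal representation):
```python
def intersect_sorted_ranges(a, b):
    """Intersect two sorted lists of half-open (start, stop) ranges in O(len(a) + len(b))."""
    out, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        start = max(a[i][0], b[j][0])
        stop = min(a[i][1], b[j][1])
        if start < stop:
            out.append((start, stop))
        # Advance whichever range finishes first.
        if a[i][1] < b[j][1]:
            i += 1
        else:
            j += 1
    return out

assert intersect_sorted_ranges([(0, 5), (10, 20)], [(3, 12)]) == [(3, 5), (10, 12)]
```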
## Compression
In order to keep the tree compressed as operations are performed on it we define the "structural hash" of a node to be the hash of:
* The node's key (but not the node's values).
* The keys, values and children of the node's children, recursively.
This structural hash lets us identify when two sibling nodes may be merged into one node, thus keeping the tree compressed.
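A minimal sketch of this idea, assuming nodes carry `key`, `values` and `children` attributes (details may differ from the actual implementation, which caches the hash on each immutable node):
```python
def full_hash(node) -> int:
    # Hash of key, values and children, recursively.
    return hash((node.key, node.values, tuple(full_hash(c) for c in node.children)))

def structural_hash(node) -> int:
    # Deliberately leaves out this node's own values, so two siblings with
    # different values but identical structure below hash the same and are
    # candidates for being merged into one node.
    return hash((node.key, tuple(full_hash(c) for c in node.children)))
```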

5
docs/autobuild.sh Executable file

@ -0,0 +1,5 @@
# cd to current directory of script
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
cd "$parent_path"
sphinx-autobuild . _build

87
docs/background.md Normal file

@ -0,0 +1,87 @@
---
jupytext:
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
---
# Datacubes, Trees and Compressed trees
This section contains a bit more of an introduction to the datastructure; feel free to skip to the [Quickstart](quickstart.md). See the [datacube spec](https://github.com/ecmwf/datacube-spec) for even more detail and the canonical source of truth on the matter.
Qubed is primarily geared towards dealing with datafiles uniquely labeled by sets of key value pairs. We'll call a set of key value pairs that uniquely labels some data an `identifier`. Here's an example:
```python
{
'class': 'd1',
'dataset': 'climate-dt',
'generation': '1',
'date': '20241102',
'resolution': 'high',
'time': '0000',
}
```
Unfortunately, we have more than one data file. If we are lucky, the set of identifiers that currently exist might form a dense datacube that we could represent like this:
```python
{
'class': ['d1', 'd2'],
'dataset': 'climate-dt',
'generation': ['1','2','3'],
'model': 'icon',
'date': ['20241102','20241103'],
'resolution': ['high','low'],
'time': ['0000', '0600', '1200', '1800'],
}
```
with the property that any particular choice of a value for any key will correspond to a datafile that exists. So this object represents `2x1x3x1x2x2x4 = 96` different datafiles.
To save space I will also represent this same thing like this:
```
- class=d1/d2, dataset=climate-dt, generation=1/2/3, ..., time=0000/0600/1200/1800
```
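As a quick sanity check, the number of distinct datafiles a dense datacube represents is just the product of the lengths of its value lists (writing every value as a list here):
```python
from math import prod

dense = {
    'class': ['d1', 'd2'],
    'dataset': ['climate-dt'],
    'generation': ['1', '2', '3'],
    'model': ['icon'],
    'date': ['20241102', '20241103'],
    'resolution': ['high', 'low'],
    'time': ['0000', '0600', '1200', '1800'],
}
print(prod(len(v) for v in dense.values()))  # 96
```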
Unfortunately, we are not lucky and our datacubes are not always dense. In this case we might instead represent which data exists using a tree:
```{code-cell} python3
from qubed import Qube
q = Qube.from_dict({
"class=od" : {
"expver=0001": {"param=1":{}, "param=2":{}},
"expver=0002": {"param=1":{}, "param=2":{}},
},
"class=rd" : {
"expver=0001": {"param=1":{}, "param=2":{}, "param=3":{}},
"expver=0002": {"param=1":{}, "param=2":{}},
},
})
# depth controls how much of the tree is open when rendered as html.
q.html(depth=100)
```
But it's clear that the above tree contains a lot of redundant information. Many of the subtrees are identical for example. Indeed in practice a lot of our data turns out to be 'nearly dense' in that it contains many dense datacubes within it.
There are many valid ways one could compress this tree. If we add the restriction that no identical key=value pairs can be adjacent then here is the compressed tree we might get:
```{code-cell} python3
q.compress()
```
```{warning}
Without the above restriction we could, for example, have:
root
├── class=od, expver=0001/0002, param=1/2
└── class=rd
├── expver=0001, param=3
└── expver=0001/0002, param=1/2
but we do not allow this because it would mean we would have to take multiple branches in order to find data with `expver=0001`.
```
What we have now is a tree of dense datacubes which represents a single larger sparse datacube in a more compact manner. For want of a better word we'll call it a Qube.
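Compression only changes the representation, not the set of identifiers it describes; a quick check using the `n_leaves` property (also used in the quickstart):
```{code-cell} python3
assert q.compress().n_leaves == q.n_leaves
q.n_leaves
```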

44
docs/cmd.md Normal file

@ -0,0 +1,44 @@
# Command Line Usage
```bash
fdb list class=rd,expver=0001,... | qubed --from=fdblist --to=text
fdb list --minimum-keys=class class=d1,dataset=climate-dt --config prod_remoteFDB.yaml | qubed convert --from=fdb --to=text
```
`--from` options include:
* `fdb`
`--to` options include:
* `text`
* `html`
* `json`
Use `--input` and `--output` to specify input and output files respectively.
There's some handy test data in the `tests/data` directory. For example:
```bash
gzip -dc tests/data/fdb_list_compact.gz| qubed convert --from=fdb --to=text --output=qube.txt
gzip -dc tests/data/fdb_list_porcelain.gz| qubed convert --from=fdb --to=json --output=qube.json
gzip -dc tests/data/fdb_list_compact.gz | qubed convert --from=fdb --to=html --output=qube.html
# Operational data stream=oper/wave/enfo/waef
fdb list class=od,expver=0001,date=0,stream=oper --compact >> operational_compact.txt
cat operational_compact.txt | qubed convert --from=fdb --to=text --output=operational.txt
```
## Todo
--from for
* `protobuf`
* `marslist`
* `constraints`
--to for
* `json`
* `datacubes`
* `constraints`

39
docs/conf.py Normal file

@ -0,0 +1,39 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "qubed"
copyright = "2025, Tom Hodson (ECMWF)"
author = "Tom Hodson (ECMWF)"
release = "0.1.0"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"sphinx.ext.autodoc", # for generating documentation from the docstrings in our code
"sphinx.ext.napoleon", # for parsing Numpy and Google stye docstrings
"myst_nb", # For parsing markdown
]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "jupyter_execute"]
source_suffix = {
".rst": "restructuredtext",
}
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
myst_enable_extensions = [
"attrs_inline",
]

21
docs/development.md Normal file

@ -0,0 +1,21 @@
# Development
To install the latest stable release from PyPI (recommended):
```bash
pip install qubed
```
To install the latest version from github (requires rust):
```bash
pip install qubed@git+https://github.com/ecmwf/qubed.git@main
```
To build the develop branch from source, install a Rust toolchain, `pip install maturin`, then run:
```
git clone -b develop git@github.com:ecmwf/qubed.git
cd qubed
maturin develop
```

137
docs/fiab.md Normal file

@ -0,0 +1,137 @@
---
jupytext:
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
---
# Fiab
## Model Selection
This is a demo of using qubed to select from a set of forecast models that each produce a set of output variables.
First let's construct some models represented as qubes:
```{code-cell} python3
from qubed import Qube
model_1 = Qube.from_datacube({
"levtype": "pl",
"param" : ["q", "t", "u", "v", "w", "z"],
"level" : [100, 200, 300, 400, 50, 850, 500, 150, 600, 250, 700, 925, 1000],
}) | Qube.from_datacube({
"levtype": "sfc",
"param" : ["10u", "10v", "2d", "2t", "cp", "msl", "skt", "sp", "tcw", "tp"],
})
model_1 = "model=1" / ("frequency=6h" / model_1)
model_1
```
This is the most complete model. Now let's do one with fewer variables and levels:
```{code-cell} python3
model_2 = Qube.from_datacube({
"levtype": "pl",
"param" : ["q", "t"],
"level" : [100, 200, 300, 400, 50, 850, 500, 150, 600, 250, 700, 925, 1000],
}) | Qube.from_datacube({
"levtype": "sfc",
"param" : ["2t", "cp", "msl"],
})
model_2 = "model=2" / ("frequency=continuous" / model_2)
```
```{code-cell} python3
model_3 = Qube.from_datacube({
"levtype": "pl",
"param" : ["q", "t"],
"level" : [100, 200, 300, 400, 50, 850, 500, 150, 600, 250, 700, 925, 1000],
}) | Qube.from_datacube({
"levtype": "sfc",
"param" : ["2t", "cp", "msl"],
})
model_3 = "model=3" / ("frequency=6h" / model_3)
model_3
```
Now we can combine the three models into a single qube:
```{code-cell} python3
all_models = model_1 | model_2 | model_3
all_models
```
Now we can perform queries over the models. We can get all models that produce 2m temperature:
```{code-cell} python3
all_models.select({
"param" : "2t",
})
```
Filter on both parameter and frequency:
```{code-cell} python3
all_models.select({
"param" : "2t",
"frequency": "continuous",
})
```
Find all models that have some overlap with this set of parameters:
```{code-cell} python3
all_models.select({
"param" : ["q", "t", "u", "v"],
})
```
## Choosing a set of models based on the requested parameter set
```{code-cell} python3
all_models.select({
"param" : ["q", "t", "u", "v"],
"frequency": "6h",
})
```
## Using WildCards
```{code-cell} python3
daily_surface_means = Qube.from_datacube({
"model": "*",
"frequency": "*",
"levtype": "sfc",
"param": "*",
})
all_models & daily_surface_means
```
```{code-cell} python3
daily_level_means = Qube.from_datacube({
"model": "*",
"frequency": "*",
"levtype": "pl",
"param": "*",
"level": "*"
})
all_models & daily_level_means
```
```{code-cell} python3
daily_level_mean_products = all_models & daily_surface_means
for i, identifier in enumerate(daily_level_mean_products.leaves()):
print(identifier)
if i > 10:
print("...")
break
```
<!-- ## Choosing the fewest models needed to cover the requested parameter set -->
<!-- ```{code-cell} python3 -->

53
docs/index.md Normal file

@ -0,0 +1,53 @@
---
jupytext:
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
---
# Qubed
```{toctree}
:maxdepth: 1
quickstart.md
development.md
background.md
algorithms.md
fiab.md
cmd.md
```
Qubed provides a datastructure primitive for working with trees of DataCubes. If a normal tree looks like this:
```
root
├── class=od
│ ├── expver=0001
│ │ ├── param=1
│ │ └── param=2
│ └── expver=0002
│ ├── param=1
│ └── param=2
└── class=rd
├── expver=0001
│ ├── param=1
│ ├── param=2
│ └── param=3
└── expver=0002
├── param=1
└── param=2
```
A compressed view of the same set would be:
```
root
├── class=od, expver=0001/0002, param=1/2
└── class=rd
├── expver=0001, param=1/2/3
└── expver=0002, param=1/2
```
Qubed provides a datastructure that represents this compressed cube, which we call a Qube. It defines all the algorithms you would expect, such as intersection/union/difference, compression, search, transformation and filtering.
To get a little more background on the motivation and structure of a Qube go to [Background](background.md), for a more hands on intro, go to [Quickstart](quickstart.md).

35
docs/make.bat Normal file

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

273
docs/quickstart.md Normal file

@ -0,0 +1,273 @@
---
jupytext:
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
---
# Quickstart
First install qubed with `pip install qubed`. Now, let's dive in with a real world dataset from the [Climate DT](https://destine.ecmwf.int/climate-change-adaptation-digital-twin-climate-dt/). We'll pull a prebuilt qube from github and render it in its default HTML representation.
```{code-cell} python3
import requests
from qubed import Qube
climate_dt = Qube.from_json(requests.get("https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json").json())
climate_dt.html(depth=1)
```
Click the arrows to expand and drill down deeper into the data.
```{note}
There is currently a simple Qube web browser hosted [here](https://qubed.lumi.apps.dte.destination-earth.eu/). Browse that and copy the 'Example Qube Code' to download a Qube representing the selection at that point. You'll get something like `Qube.from_json(requests.get("https://qubed.lumi.apps.dte.destination-earth.eu/api/v1/select/climate-dt/?").json())`{l=python}
```
Fundamentally a Qube represents a set of identifiers, each of which is a set of key-value pairs. Here's one leaf in the Climate DT dataset:
```{code-cell} python3
next(climate_dt.leaves())
```
We can look at the set of values each key can take:
```{code-cell} python3
axes = climate_dt.axes()
for key, values in axes.items():
print(f"{key} : {list(sorted(values))[:10]}")
```
This dataset isn't dense: you can't choose just any combination of the above key-value pairs, but it does contain many dense datacubes. Hence it makes sense to store and process the set as a tree of dense datacubes, which is what a Qube is. For a sense of scale, this dataset contains about 8 million distinct datasets but only a few hundred unique nodes.
```{code-cell} python3
import objsize
print(f"""
Distinct datasets: {climate_dt.n_leaves}
Number of nodes in the tree: {climate_dt.n_nodes}
Number of dense datacubes within this qube: {len(list(climate_dt.datacubes()))}
In memory size according to objsize: {objsize.get_deep_size(climate_dt) / 2**20:.0f} MB
""")
```
## Building your own Qubes
You can build one from nested dictionaries with keys in the form "key=value":
```{code-cell} python3
from qubed import Qube
q1 = Qube.from_dict({
"class=od" : {
"expver=0001": {"param=1":{}, "param=2":{}},
"expver=0002": {"param=1":{}, "param=2":{}},
},
"class=rd" : {
"expver=0001": {"param=1":{}, "param=2":{}, "param=3":{}},
"expver=0002": {"param=1":{}, "param=2":{}},
},
})
print(f"{q1.n_leaves = }, {q1.n_nodes = }")
q1
```
If someone sends you a printed qube you can convert that back to a Qube too:
```{code-cell} python3
q2 = Qube.from_tree("""
root, frequency=6:00:00
├── levtype=pl, param=t, levelist=850, threshold=-2/-4/-8/2/4/8
└── levtype=sfc
├── param=10u/10v, threshold=10/15
├── param=2t, threshold=273.15
└── param=tp, threshold=0.1/1/10/100/20/25/5/50
""")
q2
```
We would not recommend trying to write this representation by hand though.
Finally, quite a flexible approach is to take the union of a series of dense datacubes:
```{code-cell} python3
q3 = Qube.from_datacube(
dict(
param="10u/10v/2d/2t/cp/msl/skt/sp/tcw/tp".split("/"),
threshold="*",
levtype="sfc",
frequency="6:00:00",
)
) | Qube.from_datacube(
dict(
param="q/t/u/v/w/z".split("/"),
threshold="*",
levtype="pl",
level="50/100/150/200/250/300/400/500/600/700/850".split("/"),
frequency="6:00:00",
)
)
q3
```
## Operations on Qubes
Going back to that first qube:
```{code-cell} python3
q1
```
We can compress it:
```{code-cell} python3
cq = q1.compress()
assert cq.n_leaves == q1.n_leaves
print(f"{cq.n_leaves = }, {cq.n_nodes = }")
cq
```
With the HTML representation you can click on the leaves to expand them. You can copy a path representation of a node to the clipboard by alt/option/⌥ clicking on it. You can then extract that node in code using `[]`:
```{code-cell} python3
cq["class=rd,expver=0001"]
```
Select a subtree:
```{code-cell} python3
cq["class", "od"]["expver", "0001"]
```
Intersect with a dense datacube:
```{code-cell} python3
dq = Qube.from_datacube({
"class": ["od", "rd", "cd"],
"expver": ["0001", "0002", "0003"],
"param": "2",
})
(cq & dq).print()
```
## Iteration
Iterate over the leaves:
```{code-cell} python3
for i, identifier in enumerate(cq.leaves()):
print(identifier)
if i > 10:
print("...")
break
```
Or, if you can, it's more efficient to iterate over the datacubes:
```{code-cell} python3
list(cq.datacubes())
```
## Selection
Select a subset of the tree:
```{code-cell} python3
climate_dt.select({
"activity": "scenariomip"
}).html(depth=1)
```
Use `.span("key")` to get the set of possibles values for a key, note this includes anywhere this key appears in the tree.
```{code-cell} python3
climate_dt.span("activity")
```
Use `.axes()` to get the span of every key in one go.
```{code-cell} python3
axes = climate_dt.axes()
for key, values in axes.items():
print(f"{key} : {list(values)[:10]}")
```
## Set Operations
The union/intersection/difference of two dense datacubes is not itself dense.
```{code-cell} python3
A = Qube.from_dict({"a=1/2/3" : {"b=i/j/k" : {}},})
B = Qube.from_dict({"a=2/3/4" : {"b=j/k/l" : {}},})
A.print(), B.print();
```
Union:
```{code-cell} python3
(A | B).print();
```
Intersection:
```{code-cell} python3
(A & B).print();
```
Difference:
```{code-cell} python3
(A - B).print();
```
Symmetric Difference:
```{code-cell} python3
(A ^ B).print();
```
## Transformations
`q.transform` takes a Python function from one node to one or more nodes and uses this to build a new tree. This can be used for simple operations on the key or values but also to split or remove nodes. Note that you can't use it to merge nodes because it's only allowed to see one node at a time.
```{code-cell} python3
def capitalize(node): return node.replace(key = node.key.capitalize())
climate_dt.transform(capitalize).html(depth=1)
```
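Because the function can return a list of nodes, returning an empty list drops a node entirely, along with everything below it. For example, to prune every node with a given key (here `generation`, assuming that key appears in this dataset):
```{code-cell} python3
def drop_generation(node):
    return [] if node.key == "generation" else node
climate_dt.transform(drop_generation).html(depth=1)
```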
## Save to disk
There is currently a very simple JSON serialisation format. More compact binary serialisations are planned.
```{code-cell} python3
json = climate_dt.to_json()
Qube.from_json(json) == climate_dt
```
## Advanced Selection
There is currently partial support for different datatypes in addition to strings. Here we can convert datatypes by key to ints and timedeltas and then use functions as filters in select.
```{code-cell} python3
from datetime import timedelta, datetime
def to_timedelta(t):
dt = datetime.strptime(t, "%H:%M:%S")
return timedelta(hours=dt.hour, minutes=dt.minute, seconds=dt.second)
q = Qube.from_tree("""
root, frequency=6:00:00
├── levtype=pl, levelist=850, threshold=-2/-4/-8/2/4/8
└── levtype=sfc
├── param=10u/10v, threshold=10/15
├── param=2t, threshold=273.15
└── param=tp, threshold=0.1/1/10/100/20/25/5/50
""").convert_dtypes({
"threshold": float,
"levelist": int,
"frequency": to_timedelta,
})
r = q.select({
"threshold": lambda t: t > 5,
"frequency": lambda dt: dt > timedelta(hours = 2),
})
r
```

3
docs/requirements.txt Normal file

@ -0,0 +1,3 @@
numpy
scipy
objsize

1
fiab/.gitignore vendored Normal file

@ -0,0 +1 @@
!*.json

37
fiab/example_products.md Normal file
View File

@ -0,0 +1,37 @@
Simplest possible product
- one field: 2 metre temperature
- all models that output param=2t would work
- may also have a lead time range specified from
So we could say "here are all the models with param=2t with lead times in the specified interval"
quantiles
param:
float range from 0 - 100
threshold:
"give me 2 metre temperature values that are above this threshold"
product requirements can be specified as a set of:
params: one or more params
levels: one or more or all
time:
- product could be specific to a particular time
- could require at least a month's worth of data
make some fake models that have:
- fewer params
- continuous times vs steps of 6 hours
-
Could also represent what data is currently cached on disk and be able to then tell the user what they can generate really fast.
API want:
- way to get axis span like what params exist
-

48
fiab/extract.py Normal file

@ -0,0 +1,48 @@
import json
from collections import defaultdict
from qubed import Qube
metadata = json.load(open("raw_anemoi_metadata.json"))
predicted_indices = [
*metadata["data_indices"]["data"]["output"]["prognostic"],
*metadata["data_indices"]["data"]["output"]["diagnostic"],
]
variables = metadata["dataset"]["variables"]
variables = [variables[i] for i in predicted_indices]
# print('Raw Model Variables:', variables)
# Split variables between pressure and surface
surface_variables = [v for v in variables if "_" not in v]
# Collect the levels for each pressure variable
level_variables = defaultdict(list)
for v in variables:
if "_" in v:
variable, level = v.split("_")
level_variables[variable].append(int(level))
# print(level_variables)
model_tree = Qube.empty()
for variable, levels in level_variables.items():
model_tree = model_tree | Qube.from_datacube(
{
"levtype": "pl",
"param": variable,
"level": levels,
}
)
for variable in surface_variables:
model_tree = model_tree | Qube.from_datacube(
{
"levtype": "sfc",
"param": variable,
}
)
print(model_tree.to_json())

File diff suppressed because one or more lines are too long

67
fiab/structure.yaml Normal file

@ -0,0 +1,67 @@
# Format: list of models, each model has a model_outputs field which contains a nested tree of nodes
# Nodes have {node: name, cube: list of key value(s) pairs, children: list[nodes]}
- model: surface_and_atmosphere_model
model_outputs:
- node: root
cube:
class: rd
stream: anemoi
expver: something
lead_time:
type: datetime
format: '%Y-%m-%d %H:%M:%S'
step: 6h
children:
- node: pressure_variables
other_metadata: something
cube:
param: ['q', 't', 'u', 'v', 'w', 'z']
level: [50, 100, 150, 200, 250, 300, 400, 500, 600, 700, 850, 925, 1000]
- node: surface_variables
other_metadata: something
cube:
param: ['sp', 'msl', '10u', '10v', '2t', '2d', 'skt', 'tcw', 'cp', 'tp']
# Hypothetical Ocean variables
- node: ocean_variables
cube:
param: ["saltiness", "number of eels", "is_blue", "good_for_surfing"]
ocean_levels: [??, ??]
# Alternative List of cubes format
- model: surface_and_atmosphere_model
model_outputs:
- node: root
cube:
class: rd
stream: anemoi
expver: something
lead_time:
type: datetime
format: '%Y-%m-%d %H:%M:%S'
step: 6h
children:
- node: pressure_variables
other_metadata: something
cube:
param: ['q', 't', 'u', 'v', 'w', 'z']
level: [50, 100, 150, 200, 250, 300, 400, 500, 600, 700, 850, 925, 1000]
- node: surface_variables
other_metadata: something
cube:
param: ['sp', 'msl', '10u', '10v', '2t', '2d', 'skt', 'tcw', 'cp', 'tp']
# Hypothetical Ocean variables
- node: ocean_variables
cube:
param: ["saltiness", "number of eels", "is_blue", "good_for_surfing"]
ocean_levels: [??, ??]

67
pyproject.toml Normal file

@ -0,0 +1,67 @@
[build-system]
requires = ["maturin>=1.0,<2.0"]
build-backend = "maturin"
[project]
name = "qubed"
description = "A library that provides a tree of datacubes called Qube."
readme = "README.md"
authors = [
{name = "Tom Hodson", email = "thomas.hodson@ecmwf.int"},
]
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
requires-python = ">= 3.11"
dynamic = ["version"]
dependencies = [
"frozendict",
"numpy",
"protobuf",
# CLI
"rich",
"click",
"psutil"
]
# Because this is a mixed rust/python project the structure is src/python/qubed rather than the more typical src/qubed
# Since this is slightly non-standard we have to explicitly tell setuptools the python source is there.
[tool.setuptools.packages.find]
where = ["src/python"]
[project.scripts]
qubed = "qubed.__main__:main"
[tool.maturin]
python-source = "src/python"
module-name = "qubed.rust"
features = ["pyo3/extension-module"]
[project.optional-dependencies]
stac_server = [
"fastapi",
]
docs = [
"sphinx",
"sphinx-rtd-theme",
"myst_nb",
"sphinx-autobuild",
"jupyterlab",
"ipykernel",
]
dev = [
"pytest",
"black",
"ruff",
"flake8",
"pre-commit",
"isort",
]

2
run.sh

@ -4,4 +4,4 @@ cd backend
--reload-include="*.html" \
--reload-include="*.css" \
--reload-include="*.js" \
--reload-include="*.yaml"
--reload-include="*.yaml"


@ -1,7 +1,7 @@
cd backend
# sudo ../.venv/bin/fastapi dev main.py --port 80
# sudo ../.venv/bin/fastapi dev main.py --port 80
sudo ../.venv/bin/uvicorn main:app --port 80 --host 0.0.0.0 --reload\
--reload-include="*.html" \
--reload-include="*.css" \
--reload-include="*.js" \
--reload-include="*.yaml"
--reload-include="*.yaml"


@ -2,12 +2,8 @@ set -e
sudo docker login eccr.ecmwf.int
# Uses ssh agent to check out private repos
# Make sure that ssh agent is running, your key is added
# and potentially that you're using ssh-forwarding if building on a remote machine
sudo DOCKER_BUILDKIT=1 docker build \
--ssh default=${SSH_AUTH_SOCK} \
sudo docker build \
--tag=eccr.ecmwf.int/qubed/stac_server:latest \
--target=stac_server \
.
sudo docker --debug push eccr.ecmwf.int/qubed/stac_server:latest
sudo docker push eccr.ecmwf.int/qubed/stac_server:latest


@ -1 +1,2 @@
helm upgrade stac-server chart -n stac-server
# helm install qubed chart -n qubed
helm upgrade qubed chart -n qubed

1
scripts/logs.sh Normal file

@ -0,0 +1 @@
kubectl -n qubed logs deployment/stac-server

2
scripts/restart.sh Executable file

@ -0,0 +1,2 @@
# kubectl rollout restart deployment/redis
kubectl -n qubed rollout restart deployment/stac-server

692
src/python/qubed/Qube.py Normal file

@ -0,0 +1,692 @@
# This causes python types to be evaluated later,
# allowing you to reference types like Qube inside the definition of the Qube class
# without having to do "Qube"
from __future__ import annotations
import dataclasses
import functools
import json
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path
from typing import Any, Iterable, Iterator, Literal, Mapping, Self, Sequence
import numpy as np
from frozendict import frozendict
from . import set_operations
from .metadata import from_nodes
from .protobuf.adapters import proto_to_qube, qube_to_proto
from .tree_formatters import (
HTML,
_display,
node_tree_to_html,
node_tree_to_string,
)
from .value_types import (
QEnum,
ValueGroup,
WildcardGroup,
values_from_json,
)
@dataclass
class AxisInfo:
key: str
type: Any
depths: set[int]
values: set
def combine(self, other: Self):
self.key = other.key
self.type = other.type
self.depths.update(other.depths)
self.values.update(other.values)
# print(f"combining {self} and {other} getting {result}")
def to_json(self):
return {
"key": self.key,
"type": self.type.__name__,
"values": list(self.values),
"depths": list(self.depths),
}
@dataclass(frozen=True, eq=True, order=True, unsafe_hash=True)
class QubeNamedRoot:
"Helper class to print a custom root name"
key: str
children: tuple[Qube, ...] = ()
def summary(self) -> str:
return self.key
@dataclass(frozen=False, eq=True, order=True, unsafe_hash=True)
class Qube:
key: str
values: ValueGroup
metadata: frozendict[str, np.ndarray] = field(
default_factory=lambda: frozendict({}), compare=False
)
children: tuple[Qube, ...] = ()
is_root: bool = False
is_leaf: bool = False
depth: int = field(default=0, compare=False)
shape: tuple[int, ...] = field(default=(), compare=False)
@classmethod
def make_node(
cls,
key: str,
values: Iterable | QEnum | WildcardGroup,
children: Iterable[Qube],
metadata: Mapping[str, np.ndarray] = {},
is_root: bool = False,
is_leaf: bool | None = None,
) -> Qube:
if isinstance(values, ValueGroup):
values = values
else:
values = QEnum(values)
if not isinstance(values, WildcardGroup) and not is_root:
assert len(values) > 0, "Nodes must have at least one value"
children = tuple(sorted(children, key=lambda n: ((n.key, n.values.min()))))
return cls(
key,
values=values,
children=children,
metadata=frozendict(metadata),
is_root=is_root,
is_leaf=(not len(children)) if is_leaf is None else is_leaf,
)
@classmethod
def make_root(cls, children: Iterable[Qube], metadata={}) -> Qube:
def update_depth_shape(children, depth, shape):
for child in children:
child.depth = depth + 1
child.shape = shape + (len(child.values),)
update_depth_shape(child.children, child.depth, child.shape)
update_depth_shape(children, depth=0, shape=(1,))
return cls.make_node(
"root",
values=QEnum(("root",)),
children=children,
metadata=metadata,
is_root=True,
)
def replace(self, **kwargs) -> Qube:
return dataclasses.replace(self, **kwargs)
def summary(self) -> str:
if self.is_root:
return self.key
return f"{self.key}={self.values.summary()}" if self.key != "root" else "root"
@classmethod
def load(cls, path: str | Path) -> Qube:
with open(path, "r") as f:
return Qube.from_json(json.load(f))
@classmethod
def from_datacube(cls, datacube: Mapping[str, str | Sequence[str]]) -> Qube:
key_vals = list(datacube.items())[::-1]
children: list[Qube] = []
for key, values in key_vals:
values_group: ValueGroup
if values == "*":
values_group = WildcardGroup()
elif isinstance(values, list):
values_group = QEnum(values)
else:
values_group = QEnum([values])
children = [cls.make_node(key, values_group, children)]
return cls.make_root(children)
@classmethod
def from_json(cls, json: dict) -> Qube:
def from_json(json: dict, depth=0) -> Qube:
return Qube.make_node(
key=json["key"],
values=values_from_json(json["values"]),
metadata=frozendict(json["metadata"]) if "metadata" in json else {},
children=(from_json(c, depth + 1) for c in json["children"]),
is_root=(depth == 0),
)
return from_json(json)
@classmethod
def from_nodes(cls, nodes: dict[str, dict], add_root: bool = True):
return from_nodes(cls, nodes, add_root)
def to_json(self) -> dict:
def to_json(node: Qube) -> dict:
return {
"key": node.key,
"values": node.values.to_json(),
"metadata": dict(node.metadata),
"children": [to_json(c) for c in node.children],
}
return to_json(self)
@classmethod
def from_dict(cls, d: dict) -> Qube:
def from_dict(d: dict) -> Iterator[Qube]:
for k, children in d.items():
key, values = k.split("=")
values = values.split("/")
# children == {"..." : {}}
# is a special case to represent trees with leaves we don't know about
if frozendict(children) == frozendict({"...": {}}):
yield Qube.make_node(
key=key,
values=values,
children={},
is_leaf=False,
)
# Special case for Wildcard values
if values == ["*"]:
values = WildcardGroup()
else:
values = QEnum(values)
yield Qube.make_node(
key=key,
values=values,
children=from_dict(children),
)
return Qube.make_root(list(from_dict(d)))
def to_dict(self) -> dict:
def to_dict(q: Qube) -> tuple[str, dict]:
key = f"{q.key}={','.join(str(v) for v in q.values)}"
return key, dict(to_dict(c) for c in q.children)
return to_dict(self)[1]
@classmethod
def from_protobuf(cls, msg: bytes) -> Qube:
return proto_to_qube(cls, msg)
def to_protobuf(self) -> bytes:
return qube_to_proto(self)
@classmethod
def from_tree(cls, tree_str):
lines = tree_str.splitlines()
stack = []
root = {}
initial_indent = None
for line in lines:
if not line.strip():
continue
# Remove tree characters and measure indent level
stripped = line.lstrip(" │├└─")
indent = (len(line) - len(stripped)) // 4
if initial_indent is None:
initial_indent = indent
indent = indent - initial_indent
# Split multiple key=value parts into nested structure
keys = [item.strip() for item in stripped.split(",")]
current = bottom = {}
for key in reversed(keys):
current = {key: current}
# Adjust the stack to current indent level
# print(len(stack), stack)
while len(stack) > indent:
stack.pop()
if stack:
# Add to the dictionary at current stack level
parent = stack[-1]
key = list(current.keys())[0]
if key in parent:
raise ValueError(
f"This function doesn't yet support reading in uncompressed trees, repeated key is {key}"
)
parent[key] = current[key]
else:
# Top level
key = list(current.keys())[0]
if root:
raise ValueError(
f"This function doesn't yet support reading in uncompressed trees, repeated key is {key}"
)
root = current[key]
# Push to the stack
stack.append(bottom)
return cls.from_dict(root)
@classmethod
def empty(cls) -> Qube:
return Qube.make_root([])
def __str_helper__(self, depth=None, name=None) -> str:
node = self
if name is not None:
node = node.replace(key=name)
out = "".join(node_tree_to_string(node=node, depth=depth))
if out[-1] == "\n":
out = out[:-1]
return out
def __str__(self):
return self.__str_helper__()
def __repr__(self):
return f"Qube({self.__str_helper__()})"
def print(self, depth=None, name: str | None = None):
print(self.__str_helper__(depth=depth, name=name))
def html(
self,
depth=2,
collapse=True,
name: str | None = None,
info: Callable[[Qube], str] | None = None,
) -> HTML:
node = self
if name is not None:
node = node.replace(key=name)
return HTML(
node_tree_to_html(node=node, depth=depth, collapse=collapse, info=info)
)
def _repr_html_(self) -> str:
return node_tree_to_html(self, depth=2, collapse=True)
# Allow "key=value/value" / qube to prepend keys
def __rtruediv__(self, other: str) -> Qube:
key, values = other.split("=")
values_enum = QEnum((values.split("/")))
return Qube.make_root([Qube.make_node(key, values_enum, self.children)])
def __or__(self, other: Qube) -> Qube:
return set_operations.operation(
self, other, set_operations.SetOperation.UNION, type(self)
)
def __and__(self, other: Qube) -> Qube:
return set_operations.operation(
self, other, set_operations.SetOperation.INTERSECTION, type(self)
)
def __sub__(self, other: Qube) -> Qube:
return set_operations.operation(
self, other, set_operations.SetOperation.DIFFERENCE, type(self)
)
def __xor__(self, other: Qube) -> Qube:
return set_operations.operation(
self, other, set_operations.SetOperation.SYMMETRIC_DIFFERENCE, type(self)
)
def leaves(self) -> Iterable[dict[str, str]]:
for value in self.values:
if not self.children:
yield {self.key: value}
for child in self.children:
for leaf in child.leaves():
if self.key != "root":
yield {self.key: value, **leaf}
else:
yield leaf
def leaf_nodes(self) -> "Iterable[tuple[dict[str, str], Qube]]":
for value in self.values:
if not self.children:
yield ({self.key: value}, self)
for child in self.children:
for leaf in child.leaf_nodes():
if self.key != "root":
yield ({self.key: value, **leaf[0]}, leaf[1])
else:
yield leaf
def leaves_with_metadata(
self, indices=()
) -> Iterator[tuple[dict[str, str], dict[str, str | np.ndarray]]]:
if self.key == "root":
for c in self.children:
yield from c.leaves_with_metadata(indices=())
return
for index, value in enumerate(self.values):
indexed_metadata = {
k: vs[indices + (index,)] for k, vs in self.metadata.items()
}
indexed_metadata = {
k: v.item() if v.shape == () else v for k, v in indexed_metadata.items()
}
if not self.children:
yield {self.key: value}, indexed_metadata
for child in self.children:
for leaf, metadata in child.leaves_with_metadata(
indices=indices + (index,)
):
if self.key != "root":
yield {self.key: value, **leaf}, metadata | indexed_metadata
else:
yield leaf, metadata
def datacubes(self) -> Iterable[dict[str, Any | list[Any]]]:
def to_list_of_cubes(node: Qube) -> Iterable[dict[str, Any | list[Any]]]:
if node.key == "root":
for c in node.children:
yield from to_list_of_cubes(c)
else:
if not node.children:
yield {node.key: list(node.values)}
for c in node.children:
for sub_cube in to_list_of_cubes(c):
yield {node.key: list(node.values)} | sub_cube
return to_list_of_cubes(self)
def __getitem__(self, args) -> Qube:
if isinstance(args, str):
specifiers = args.split(",")
current = self
for specifier in specifiers:
key, values_str = specifier.split("=")
values = values_str.split("/")
for c in current.children:
if c.key == key and set(values) == set(c.values):
current = c
break
else:
raise KeyError(
f"Key '{key}' not found in children of '{current.key}', available keys are {[c.key for c in current.children]}"
)
return Qube.make_root(current.children)
elif isinstance(args, tuple) and len(args) == 2:
key, value = args
for c in self.children:
if c.key == key and value in c.values:
return Qube.make_root(c.children)
raise KeyError(f"Key '{key}' not found in children of {self.key}")
else:
raise ValueError(f"Unknown key type {args}")
@cached_property
def n_leaves(self) -> int:
# This line makes the equation q.n_leaves + r.n_leaves == (q | r).n_leaves true if q and r have no overlap
if self.key == "root" and not self.children:
return 0
return len(self.values) * (
sum(c.n_leaves for c in self.children) if self.children else 1
)
@cached_property
def n_nodes(self) -> int:
if self.key == "root" and not self.children:
return 0
return 1 + sum(c.n_nodes for c in self.children)
def transform(self, func: "Callable[[Qube], Qube | Iterable[Qube]]") -> Qube:
"""
Call a function on every node of the Qube, return one or more nodes.
If multiple nodes are returned they each get a copy of the (transformed) children of the original node.
Any changes to the children of a node will be ignored.
"""
def transform(node: Qube) -> list[Qube]:
children = tuple(sorted(cc for c in node.children for cc in transform(c)))
new_nodes = func(node)
if isinstance(new_nodes, Qube):
new_nodes = [new_nodes]
return [new_node.replace(children=children) for new_node in new_nodes]
children = tuple(cc for c in self.children for cc in transform(c))
return self.replace(children=children)
def remove_by_key(self, keys: str | list[str]):
_keys: list[str] = keys if isinstance(keys, list) else [keys]
def remove_key(node: Qube) -> Qube:
children: list[Qube] = []
for c in node.children:
if c.key in _keys:
grandchildren = tuple(sorted(remove_key(cc) for cc in c.children))
grandchildren = remove_key(Qube.make_root(grandchildren)).children
children.extend(grandchildren)
else:
children.append(remove_key(c))
return node.replace(children=tuple(sorted(children)))
return remove_key(self).compress()
def convert_dtypes(self, converters: dict[str, Callable[[Any], Any]]):
def convert(node: Qube) -> Qube:
if node.key in converters:
converter = converters[node.key]
values = [converter(v) for v in node.values]
new_node = node.replace(values=QEnum(values))
return new_node
return node
return self.transform(convert)
def select(
self,
selection: dict[str, str | list[str] | Callable[[Any], bool]],
mode: Literal["strict", "relaxed"] = "relaxed",
consume=False,
) -> Qube:
# Find any bare str values and replace them with [str]
_selection: dict[str, list[str] | Callable[[Any], bool]] = {}
for k, v in selection.items():
if isinstance(v, list):
_selection[k] = v
elif callable(v):
_selection[k] = v
else:
_selection[k] = [v]
def not_none(xs):
return tuple(x for x in xs if x is not None)
def select(
node: Qube,
selection: dict[str, list[str] | Callable[[Any], bool]],
matched: bool,
) -> Qube | None:
# If this node has no children but there are still parts of the request
# that have not been consumed, then prune this whole branch
if consume and not node.children and selection:
return None
# If the key isn't in the selection then what we do depends on the mode:
# In strict mode we just stop here
# In next_level mode we include the next level down so you can tell what keys to add next
# In relaxed mode we skip the key if it's not in the request and carry on
if node.key not in selection:
if mode == "strict":
return None
elif mode == "next_level":
return node.replace(
children=(),
metadata=self.metadata
| {"is_leaf": np.array([not bool(node.children)])},
)
elif mode == "relaxed":
pass
else:
raise ValueError(f"Unknown mode argument {mode}")
# If the key IS in the selection then check if the values match
if node.key in _selection:
# If the key is specified, check if any of the values match
selection_criteria = _selection[node.key]
if callable(selection_criteria):
values = QEnum((c for c in node.values if selection_criteria(c)))
elif isinstance(selection_criteria, list):
values = QEnum((c for c in selection_criteria if c in node.values))
else:
raise ValueError(f"Unknown selection type {selection_criteria}")
# Here modes don't matter because we've explicitly filtered on this key and found nothing
if not values:
return None
matched = True
node = node.replace(values=values)
if consume:
selection = {k: v for k, v in selection.items() if k != node.key}
# Prune nodes that had had all their children pruned
new_children = not_none(
select(c, selection, matched) for c in node.children
)
if node.children and not new_children:
return None
metadata = dict(node.metadata)
if mode == "next_level":
metadata["is_leaf"] = np.array([not bool(node.children)])
return node.replace(
children=new_children,
metadata=metadata,
)
return self.replace(
children=not_none(
select(c, _selection, matched=False) for c in self.children
)
)
def span(self, key: str) -> list[str]:
"""
Search the whole tree for any value that a given key takes anywhere.
"""
this = set(self.values) if self.key == key else set()
return sorted(this | set(v for c in self.children for v in c.span(key)))
def axes(self) -> dict[str, set[str]]:
"""
Return a dictionary of all the spans of the keys in the qube.
"""
axes = defaultdict(set)
for c in self.children:
for k, v in c.axes().items():
axes[k].update(v)
if self.key != "root":
axes[self.key].update(self.values)
return dict(axes)
def axes_info(self, depth=0) -> dict[str, AxisInfo]:
axes = defaultdict(
lambda: AxisInfo(key="", type=str, depths=set(), values=set())
)
for c in self.children:
for k, info in c.axes_info(depth=depth + 1).items():
axes[k].combine(info)
if self.key != "root":
axes[self.key].combine(
AxisInfo(
key=self.key,
type=type(next(iter(self.values))),
depths={depth},
values=set(self.values),
)
)
return dict(axes)
@cached_property
def structural_hash(self) -> int:
"""
This hash takes into account the key, values and children's key values recursively.
Because nodes are immutable, we only need to compute this once.
"""
def hash_node(node: Qube) -> int:
return hash(
(node.key, node.values, tuple(c.structural_hash for c in node.children))
)
return hash_node(self)
def compress(self) -> Qube:
"""
This method is quite computationally heavy because of trees like this:
root, class=d1, generation=1
time=0600, many identical keys, param=8,78,79
time=0600, many identical keys, param=8,78,79
time=0600, many identical keys, param=8,78,79
This tree compresses down
"""
def union(a: Qube, b: Qube) -> Qube:
b = type(self).make_root(children=(b,))
out = set_operations.operation(
a, b, set_operations.SetOperation.UNION, type(self)
)
return out
new_children = [c.compress() for c in self.children]
if len(new_children) > 1:
new_children = list(
functools.reduce(union, new_children, Qube.empty()).children
)
return self.replace(children=tuple(sorted(new_children)))
def add_metadata(self, **kwargs: dict[str, Any]):
metadata = {
k: np.array(
[
v,
]
)
for k, v in kwargs.items()
}
return self.replace(metadata=metadata)
def strip_metadata(self) -> Qube:
def strip(node):
return node.replace(metadata=frozendict({}))
return self.transform(strip)
def display(self):
_display(self)


@ -0,0 +1,4 @@
from . import protobuf
from .Qube import Qube
__all__ = ["Qube", "protobuf"]


@ -0,0 +1,124 @@
import json
import time
import click
import psutil
from rich.console import Console
from rich.layout import Layout
from rich.live import Live
from rich.panel import Panel
from rich.spinner import Spinner
from rich.text import Text
from qubed import Qube
from qubed.convert import parse_fdb_list
console = Console(stderr=True)
process = psutil.Process()
PRINT_INTERVAL = 0.25
@click.group()
def main():
"""Command-line tool for working with trees."""
pass
@main.command()
@click.option(
"--input",
type=click.File("r"),
default="-",
help="Specify the input file (default: standard input).",
)
@click.option(
"--output",
type=click.File("w"),
default="-",
help="Specify the output file (default: standard output).",
)
@click.option(
"--from",
"from_format",
type=click.Choice(["fdb", "mars"]),
default="fdb",
help="Specify the input format: fdb (fdb list --porcelain) or mars (mars list).",
)
@click.option(
"--to",
"to_format",
type=click.Choice(["text", "html", "json"]),
default="text",
help="Specify the output format: text, html, json.",
)
def convert(input, output, from_format, to_format):
"""Convert trees from one format to another."""
q = Qube.empty()
t = time.time()
i0 = 0
n0 = 0
depth = 5
log = Text()
summary = Layout()
summary.split_column(
Layout(name="upper"),
Layout(name="qube"),
)
summary["upper"].split_row(
Layout(name="performance"),
Layout(log, name="log"),
)
spinner = Spinner("aesthetic", text="Performance", speed=0.3)
with Live(summary, auto_refresh=False, transient=True, console=console) as live:
for i, datacube in enumerate(parse_fdb_list(input)):
new_branch = Qube.from_datacube(datacube)
q = q | new_branch
if time.time() - t > PRINT_INTERVAL:
tree = q.__str__(depth=depth)
if tree.count("\n") > 20:
depth -= 1
if tree.count("\n") < 5:
depth += 1
summary["performance"].update(
Panel(
Text.assemble(
f"The Qube has {q.n_leaves} leaves and {q.n_nodes} internal nodes so far.\n",
f"{(i - i0) / (time.time() - t) / PRINT_INTERVAL:.0f} lines per second. ",
f"{(q.n_leaves - n0) / (time.time() - t):.0f} leaves per second.\n",
f"Memory usage: {process.memory_info().rss / 1024 / 1024:.0f} MB\n",
),
title=spinner.render(time.time()),
border_style="blue",
)
)
summary["qube"].update(
Panel(tree, title=f"Qube (depth {depth})", border_style="blue")
)
summary["log"].update(
Panel(
f"{datacube}", border_style="blue", title="Last Datacube Added"
)
)
live.refresh()
i0 = i
n0 = q.n_leaves
t = time.time()
if to_format == "text":
output_content = str(q)
elif to_format == "json":
output_content = json.dumps(q.to_json())
elif to_format == "html":
output_content = q.html().html
else:
output_content = str(q)
output.write(output_content)
if __name__ == "__main__":
main()
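# Example invocation (illustrative; assumes the package exposes this module as a
# console script, e.g. `qubed`):
#   fdb list --porcelain | qubed convert --from=fdb --to=json --output=tree.json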

View File

@ -0,0 +1,29 @@
def parse_key_value_pairs(text: str):
result = {}
text = text.replace("}{", ",") # Replace segment separators
text = (
text.replace("{", "").replace("}", "").strip()
) # Remove leading/trailing braces
for segment in text.split(","):
if "=" not in segment:
print(segment)
key, values_str = segment.split(
"=", 1
) # Ensure split only happens at first "="
values = values_str.split("/")
result[key] = values
return result
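# Illustrative example (hypothetical input line):
#   parse_key_value_pairs("{class=od,expver=0001}{param=1/2}")
#   -> {"class": ["od"], "expver": ["0001"], "param": ["1", "2"]}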
def parse_fdb_list(f):
for line in f.readlines():
# Handle fdb list normal
if line.startswith("{"):
yield parse_key_value_pairs(line)
# handle fdb list --compact
if line.startswith("retrieve,") and not line.startswith("retrieve,\n"):
line = line[9:]
yield parse_key_value_pairs(line)

View File

@ -0,0 +1,43 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Iterator
import numpy as np
from .value_types import QEnum
if TYPE_CHECKING:
from .Qube import Qube
def make_node(
cls,
key: str,
values: Iterator,
shape: list[int],
children: tuple[Qube, ...],
metadata: dict[str, np.ndarray] | None = None,
):
return cls.make_node(
key=key,
values=QEnum(values),
metadata={k: np.array(v).reshape(shape) for k, v in metadata.items()}
if metadata is not None
else {},
children=children,
)
def from_nodes(cls, nodes, add_root=True):
shape = [len(n["values"]) for n in nodes.values()]
nodes = nodes.items()
*nodes, (key, info) = nodes
root = make_node(cls, shape=shape, children=(), key=key, **info)
for key, info in reversed(nodes):
shape.pop()
root = make_node(cls, shape=shape, children=(root,), key=key, **info)
if add_root:
return cls.make_root(children=(root,))
return root
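# Illustrative sketch (hypothetical input): from_nodes builds a single chain of nodes, e.g.
#   from_nodes(Qube, {
#       "class": {"values": ["od"]},
#       "param": {"values": ["1", "2"]},
#   })
# gives root -> class=od -> param=1/2, with any metadata arrays reshaped to the
# accumulated shape at each level.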

View File

View File

@ -0,0 +1,109 @@
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING
import numpy as np
from frozendict import frozendict
from ..value_types import QEnum
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Protobuf gencode version",
UserWarning,
"google.protobuf.runtime_version",
)
from . import qube_pb2
if TYPE_CHECKING:
from ..Qube import Qube
def _ndarray_to_proto(arr: np.ndarray) -> qube_pb2.NdArray:
"""np.ndarray → NdArray message"""
return qube_pb2.NdArray(
shape=list(arr.shape),
dtype=str(arr.dtype),
raw=arr.tobytes(order="C"),
)
def _ndarray_from_proto(msg: qube_pb2.NdArray) -> np.ndarray:
"""NdArray message → np.ndarray (immutable view)"""
return np.frombuffer(msg.raw, dtype=msg.dtype).reshape(tuple(msg.shape))
def _py_to_valuegroup(value: list[str] | np.ndarray) -> qube_pb2.ValueGroup:
"""Accept str-sequence *or* ndarray and return ValueGroup."""
vg = qube_pb2.ValueGroup()
if isinstance(value, np.ndarray):
vg.tensor.CopyFrom(_ndarray_to_proto(value))
else:
vg.s.items.extend(value)
return vg
def _valuegroup_to_py(vg: qube_pb2.ValueGroup) -> QEnum | np.ndarray:
"""ValueGroup message → QEnum *or* ndarray"""
arm = vg.WhichOneof("payload")
if arm == "tensor":
return _ndarray_from_proto(vg.tensor)
return QEnum(vg.s.items)
def _py_to_metadatagroup(value: np.ndarray) -> qube_pb2.MetadataGroup:
"""Accept a scalar or ndarray and return a MetadataGroup message."""
vg = qube_pb2.MetadataGroup()
if not isinstance(value, np.ndarray):
value = np.array([value])
vg.tensor.CopyFrom(_ndarray_to_proto(value))
return vg
def _metadatagroup_to_py(vg: qube_pb2.MetadataGroup) -> np.ndarray:
"""MetadataGroup message → np.ndarray"""
arm = vg.WhichOneof("payload")
if arm == "tensor":
return _ndarray_from_proto(vg.tensor)
raise ValueError(f"Unknown arm {arm}")
def _qube_to_proto(q: Qube) -> qube_pb2.Qube:
"""Frozen Qube dataclass → protobuf Qube message (new object)."""
return qube_pb2.Qube(
key=q.key,
values=_py_to_valuegroup(q.values),
metadata={k: _py_to_metadatagroup(v) for k, v in q.metadata.items()},
children=[_qube_to_proto(c) for c in q.children],
is_root=q.is_root,
)
def qube_to_proto(q: Qube) -> bytes:
return _qube_to_proto(q).SerializeToString()
def _proto_to_qube(cls: type, msg: qube_pb2.Qube) -> Qube:
"""protobuf Qube message → frozen Qube dataclass (new object)."""
return cls.make_node(
key=msg.key,
values=_valuegroup_to_py(msg.values),
metadata=frozendict(
{k: _metadatagroup_to_py(v) for k, v in msg.metadata.items()}
),
children=tuple(_proto_to_qube(cls, c) for c in msg.children),
is_root=msg.is_root,
)
def proto_to_qube(cls: type, wire: bytes) -> Qube:
msg = qube_pb2.Qube()
msg.ParseFromString(wire)
return _proto_to_qube(cls, msg)
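# Illustrative round trip (assuming an existing Qube instance `q`):
#   wire = qube_to_proto(q)            # Qube -> bytes
#   q2 = proto_to_qube(type(q), wire)  # bytes -> Qube
# String values come back as QEnums and metadata as numpy arrays with their
# original shapes.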

View File

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: qube.proto
# Protobuf Python Version: 5.29.0
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
_runtime_version.Domain.PUBLIC, 5, 29, 0, "", "qube.proto"
)
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\nqube.proto"4\n\x07NdArray\x12\r\n\x05shape\x18\x01 \x03(\x03\x12\r\n\x05\x64type\x18\x02 \x01(\t\x12\x0b\n\x03raw\x18\x03 \x01(\x0c"\x1c\n\x0bStringGroup\x12\r\n\x05items\x18\x01 \x03(\t"N\n\nValueGroup\x12\x19\n\x01s\x18\x01 \x01(\x0b\x32\x0c.StringGroupH\x00\x12\x1a\n\x06tensor\x18\x02 \x01(\x0b\x32\x08.NdArrayH\x00\x42\t\n\x07payload"6\n\rMetadataGroup\x12\x1a\n\x06tensor\x18\x01 \x01(\x0b\x32\x08.NdArrayH\x00\x42\t\n\x07payload"\xd1\x01\n\x04Qube\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1b\n\x06values\x18\x02 \x01(\x0b\x32\x0b.ValueGroup\x12%\n\x08metadata\x18\x03 \x03(\x0b\x32\x13.Qube.MetadataEntry\x12\r\n\x05\x64type\x18\x04 \x01(\t\x12\x17\n\x08\x63hildren\x18\x05 \x03(\x0b\x32\x05.Qube\x12\x0f\n\x07is_root\x18\x06 \x01(\x08\x1a?\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1d\n\x05value\x18\x02 \x01(\x0b\x32\x0e.MetadataGroup:\x02\x38\x01\x62\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "qube_pb2", _globals)
if not _descriptor._USE_C_DESCRIPTORS:
DESCRIPTOR._loaded_options = None
_globals["_QUBE_METADATAENTRY"]._loaded_options = None
_globals["_QUBE_METADATAENTRY"]._serialized_options = b"8\001"
_globals["_NDARRAY"]._serialized_start = 14
_globals["_NDARRAY"]._serialized_end = 66
_globals["_STRINGGROUP"]._serialized_start = 68
_globals["_STRINGGROUP"]._serialized_end = 96
_globals["_VALUEGROUP"]._serialized_start = 98
_globals["_VALUEGROUP"]._serialized_end = 176
_globals["_METADATAGROUP"]._serialized_start = 178
_globals["_METADATAGROUP"]._serialized_end = 232
_globals["_QUBE"]._serialized_start = 235
_globals["_QUBE"]._serialized_end = 444
_globals["_QUBE_METADATAENTRY"]._serialized_start = 381
_globals["_QUBE_METADATAENTRY"]._serialized_end = 444
# @@protoc_insertion_point(module_scope)

View File

View File

@ -0,0 +1,464 @@
"""
# Set Operations
The core of this is the observation that for two sets A and B, if we compute (A - B), (A ∩ B) and (B - A)
then we can get the other operations by taking unions of the above three objects.
Union: All of them
Intersection: Just take A ∩ B
Difference: Take either A - B or B - A
Symmetric Difference (XOR): Take A - B and B - A
We start with a shallow implementation of this algorithm that only deals with a pair of nodes, not the whole tree:
shallow_set_operation(A: Qube, B: Qube) -> SetOpResult
This takes two qubes and (morally) returns (A - B), (A ∩ B) and (B - A), but only for the values and metadata at the top level.
For technical reasons that will become clear, we actually return a struct with two copies of (A ∩ B): one with the metadata and children of A (call it A'), and one with the metadata and children of B (call it B'). This is relevant when we extend the shallow algorithm to work with a whole tree, because we will recurse and compute the set operation for each pair of the children of A' and B'.
NB: Currently there are two kinds of values: QEnums, which store a list of values, and Wildcards, which 'match with everything'. shallow_set_operation checks the type of values and dispatches to different methods depending on the combination of types it finds.
"""
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
# Prevent circular imports while allowing the type checker to know what Qube is
from typing import TYPE_CHECKING, Any, Iterable
import numpy as np
from frozendict import frozendict
from .value_types import QEnum, ValueGroup, WildcardGroup
if TYPE_CHECKING:
from .Qube import Qube
class SetOperation(Enum):
"Map from set operations to which combination of (A - B), (A ∩ B), (B - A) we need."
UNION = (1, 1, 1)
INTERSECTION = (0, 1, 0)
DIFFERENCE = (1, 0, 0)
SYMMETRIC_DIFFERENCE = (1, 0, 1)
@dataclass(eq=True, frozen=True)
class ValuesIndices:
"Helper class to hold the values and indices from a node."
values: ValueGroup
indices: tuple[int, ...]
@classmethod
def from_values(cls, values: ValueGroup):
return cls(values=values, indices=tuple(range(len(values))))
@classmethod
def empty(cls):
return cls(values=QEnum([]), indices=())
def enumerate(self) -> Iterable[tuple[Any, int]]:
return zip(self.indices, self.values)
def get_indices(
metadata: frozendict[str, np.ndarray], indices: tuple[int, ...]
) -> frozendict[str, np.ndarray]:
"Given a metadata dict and some indices, return a new metadata dict with only the values indexed by the indices"
return frozendict(
{k: v[..., indices] for k, v in metadata.items() if isinstance(v, np.ndarray)}
)
@dataclass(eq=True, frozen=True)
class SetOpResult:
"""
Given two sets A and B, all possible set operations can be constructed from A - B, A ∩ B, B - A
That is, what's only in A, the intersection and what's only in B.
However, because we need to recurse on children, we actually return two intersection nodes:
only_A is a qube with:
The values in A but not in B
The metadata corresponding to these values
All the children A had
intersection_A is a qube with:
The values that intersected with B
The metadata from that intersection
All the children A had
And vice versa for only_B and intersection_B
"""
only_A: ValuesIndices
intersection_A: ValuesIndices
intersection_B: ValuesIndices
only_B: ValuesIndices
def shallow_qenum_set_operation(A: ValuesIndices, B: ValuesIndices) -> SetOpResult:
"""
For two sets of values, partition the overlap into four groups:
only_A: values and indices of values that are in A but not B
intersection_A: values and indices of values that are in both A and B
And vice versa for only_B and intersection_B.
Note that intersection_A and intersection_B contain the same values but the indices are different.
"""
# create four groups that map value -> index
only_A: dict[Any, int] = {val: i for i, val in A.enumerate()}
only_B: dict[Any, int] = {val: i for i, val in B.enumerate()}
intersection_A: dict[Any, int] = {}
intersection_B: dict[Any, int] = {}
# Go through all the values and move any that are in the intersection
# to the corresponding group, keeping the indices
for val in A.values:
if val in B.values:
intersection_A[val] = only_A.pop(val)
intersection_B[val] = only_B.pop(val)
def package(values_indices: dict[Any, int]) -> ValuesIndices:
return ValuesIndices(
values=QEnum(list(values_indices.keys())),
indices=tuple(values_indices.values()),
)
return SetOpResult(
only_A=package(only_A),
only_B=package(only_B),
intersection_A=package(intersection_A),
intersection_B=package(intersection_B),
)
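# Illustrative example: for A = QEnum(["1", "2"]) and B = QEnum(["2", "3"]) this gives
# only_A = {"1"}, only_B = {"3"} and intersection_A/intersection_B both containing "2",
# with the indices recording where each value sat in the original A and B so the
# corresponding metadata columns can be sliced out later.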
def shallow_wildcard_set_operation(A: ValuesIndices, B: ValuesIndices) -> SetOpResult:
"""
WildcardGroups behave as if they contain all the values of whatever they match against.
For two wildcards we just return both.
For A == wildcard and B == enum we have to be more careful:
1. All of B is in the intersection, so only_B is empty.
2. The wildcard may still need to match against other things, so only_A is A.
3. We return B in both the intersection_A and intersection_B slots.
This last step happens because the wildcard effectively adopts the values of whatever it matches against.
"""
# Two wildcard groups have full overlap.
if isinstance(A.values, WildcardGroup) and isinstance(B.values, WildcardGroup):
return SetOpResult(ValuesIndices.empty(), A, B, ValuesIndices.empty())
# If A is a wildcard matcher and B is not
# then the intersection is everything from B
if isinstance(A.values, WildcardGroup):
return SetOpResult(A, B, B, ValuesIndices.empty())
# If B is a wildcard matcher and A is not
# then the intersection is everything from A
if isinstance(B.values, WildcardGroup):
return SetOpResult(ValuesIndices.empty(), A, A, B)
raise NotImplementedError(
f"One of {type(A.values)} and {type(B.values)} should be WildCardGroup"
)
def shallow_set_operation(
A: ValuesIndices,
B: ValuesIndices,
) -> SetOpResult:
if isinstance(A.values, QEnum) and isinstance(B.values, QEnum):
return shallow_qenum_set_operation(A, B)
# WildcardGroups behave as if they contain all possible values.
if isinstance(A.values, WildcardGroup) or isinstance(B.values, WildcardGroup):
return shallow_wildcard_set_operation(A, B)
raise NotImplementedError(
f"Set operations on values types {type(A.values)} and {type(B.values)} not yet implemented"
)
def operation(
A: Qube, B: Qube, operation_type: SetOperation, node_type, depth=0
) -> Qube | None:
# print(f"operation({A}, {B})")
assert A.key == B.key, (
"The two Qube root nodes must have the same key to perform set operations,"
f"would usually be two root nodes. They have {A.key} and {B.key} respectively"
)
node_key = A.key
assert A.is_root == B.is_root
is_root = A.is_root
assert A.values == B.values, (
f"The two Qube root nodes must have the same values to perform set operations {A.values = }, {B.values = }"
)
node_values = A.values
# Group the children of the two nodes by key
nodes_by_key: defaultdict[str, tuple[list[Qube], list[Qube]]] = defaultdict(
lambda: ([], [])
)
new_children: list[Qube] = []
# Sort out metadata into what can stay at this level and what must move down
stayput_metadata: dict[str, np.ndarray] = {}
pushdown_metadata_A: dict[str, np.ndarray] = {}
pushdown_metadata_B: dict[str, np.ndarray] = {}
for key in set(A.metadata.keys()) | set(B.metadata.keys()):
if key not in A.metadata:
pushdown_metadata_B[key] = B.metadata[key]
continue
if key not in B.metadata:
pushdown_metadata_A[key] = A.metadata[key]
continue
A_val = A.metadata[key]
B_val = B.metadata[key]
if np.allclose(A_val, B_val):
# print(f"{' ' * depth}Keeping metadata key '{key}' at this level")
stayput_metadata[key] = A.metadata[key]
else:
# print(f"{' ' * depth}Pushing down metadata key '{key}' {A_val} {B_val}")
pushdown_metadata_A[key] = A_val
pushdown_metadata_B[key] = B_val
# Add all the metadata that needs to be pushed down to the child nodes
# When pushing down the metadata we need to account for the fact it now affects more values
# So expand the metadata entries from shape (a, b, ..., c) to (a, b, ..., c, d)
# where d is the length of the node values
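# e.g. a metadata array of shape (2, 3) on this node becomes shape (2, 3, N) on a child
# with N values, repeating each entry along the new trailing axis via np.broadcast_to.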
for node in A.children:
N = len(node.values)
meta = {
k: np.broadcast_to(v[..., np.newaxis], v.shape + (N,))
for k, v in pushdown_metadata_A.items()
}
node = node.replace(metadata=node.metadata | meta)
nodes_by_key[node.key][0].append(node)
for node in B.children:
N = len(node.values)
meta = {
k: np.broadcast_to(v[..., np.newaxis], v.shape + (N,))
for k, v in pushdown_metadata_B.items()
}
node = node.replace(metadata=node.metadata | meta)
nodes_by_key[node.key][1].append(node)
# print(f"{nodes_by_key = }")
# For every node group, perform the set operation
for key, (A_nodes, B_nodes) in nodes_by_key.items():
output = list(
_operation(A_nodes, B_nodes, operation_type, node_type, depth + 1)
)
# print(f"{' '*depth}_operation {operation_type.name} {A_nodes} {B_nodes} out = [{output}]")
new_children.extend(output)
# print(f"{' '*depth}operation {operation_type.name} [{A}] [{B}] new_children = [{new_children}]")
# If there are now no children as a result of the operation, return nothing.
if (A.children or B.children) and not new_children:
if A.key == "root":
return node_type.make_root(children=())
else:
return None
# Whenever we modify children we should recompress them
# But since `operation` is already recursive, we only need to compress this level not all levels
# Hence we use the non-recursive compress_children helper
new_children = list(compress_children(new_children))
# The values and key are the same so we just replace the children
if A.key == "root":
return node_type.make_root(
children=new_children,
metadata=stayput_metadata,
)
return node_type.make_node(
key=node_key,
values=node_values,
children=new_children,
metadata=stayput_metadata,
is_root=is_root,
)
def _operation(
A: list[Qube],
B: list[Qube],
operation_type: SetOperation,
node_type,
depth: int,
) -> Iterable[Qube]:
"""
This operation assumes that we've found two nodes that match and now want to do a set operation on their children. Hence we take in two lists of child nodes all of which have the same key but different values.
We then loop over all pairs of children from each list and compute the intersection.
"""
# print(f"_operation({A}, {B})")
keep_only_A, keep_intersection, keep_only_B = operation_type.value
# We're going to progressively remove values from the starting nodes as we do intersections
# So we make a node -> ValuesIndices mapping here for both a and b
only_a: dict[Qube, ValuesIndices] = {
n: ValuesIndices.from_values(n.values) for n in A
}
only_b: dict[Qube, ValuesIndices] = {
n: ValuesIndices.from_values(n.values) for n in B
}
def make_new_node(source: Qube, values_indices: ValuesIndices):
return source.replace(
values=values_indices.values,
metadata=get_indices(source.metadata, values_indices.indices),
)
# Iterate over all pairs (node_A, node_B) and perform the shallow set operation
# Update our copy of the original node to remove anything that appears in an intersection
for node_a in A:
for node_b in B:
set_ops_result = shallow_set_operation(only_a[node_a], only_b[node_b])
# Save reduced values back to nodes
only_a[node_a] = set_ops_result.only_A
only_b[node_b] = set_ops_result.only_B
if (
set_ops_result.intersection_A.values
and set_ops_result.intersection_B.values
):
result = operation(
make_new_node(node_a, set_ops_result.intersection_A),
make_new_node(node_b, set_ops_result.intersection_B),
operation_type,
node_type,
depth=depth + 1,
)
if result is not None:
# If we're doing a difference or xor we might want to throw away the intersection
# However we can only do this once we get to the leaf nodes, otherwise we'll
# throw away nodes too early!
# Consider Qube(root, a=1, b=1/2) - Qube(root, a=1, b=1)
# We can easily throw away the whole a node by accident here!
if keep_intersection or result.children:
yield result
elif (
not set_ops_result.intersection_A.values
and not set_ops_result.intersection_B.values
):
continue
else:
raise ValueError(
f"Only one of set_ops_result.intersection_A and set_ops_result.intersection_B is None, I didn't think that could happen! {set_ops_result = }"
)
if keep_only_A:
for node, vi in only_a.items():
if vi.values:
yield make_new_node(node, vi)
if keep_only_B:
for node, vi in only_b.items():
if vi.values:
yield make_new_node(node, vi)
def compress_children(children: Iterable[Qube], depth=0) -> tuple[Qube, ...]:
"""
Helper method that only compresses a set of nodes, and doesn't do it recursively.
Used in Qubed.compress but also to maintain compression in the set operations above.
"""
# Take the set of new children and see if any have identical key, metadata and children;
# their values may differ and will be collapsed into a single node
identical_children = defaultdict(list)
for child in children:
# only care about the key and children of each node, ignore values
h = hash((child.key, tuple((cc.structural_hash for cc in child.children))))
identical_children[h].append(child)
# Now go through and create new compressed nodes for any groups that need collapsing
new_children = []
for child_list in identical_children.values():
# If the group is size one just keep it
if len(child_list) == 1:
new_child = child_list.pop()
else:
example = child_list[0]
node_type = type(example)
value_type = type(example.values)
assert all(isinstance(child.values, value_type) for child in child_list), (
f"All nodes to be grouped must have the same value type, expected {value_type}"
)
# We know the children of this group of nodes all have the same structure
# but we still need to merge the metadata across them
# children = example.children
children = merge_metadata(child_list, example.depth)
# Do we need to recursively compress here?
# children = compress_children(children, depth=depth+1)
if value_type is QEnum:
values = QEnum(set(v for child in child_list for v in child.values))
elif value_type is WildcardGroup:
values = example.values
else:
raise ValueError(f"Unknown value type: {value_type}")
new_child = node_type.make_node(
key=example.key,
metadata=example.metadata,
values=values,
children=children,
)
new_children.append(new_child)
return tuple(sorted(new_children, key=lambda n: ((n.key, n.values.min()))))
def merge_metadata(qubes: list[Qube], axis) -> Iterable[Qube]:
"""
Given a list of qubes with identical structure,
match up the children of each node and merge the metadata
"""
# Group the children of each qube and merge them
# Exploit the fact that they have the same shape and ordering
example = qubes[0]
node_type = type(example)
for i in range(len(example.children)):
group = [q.children[i] for q in qubes]
group_example = group[0]
assert len(set((c.structural_hash for c in group))) == 1
# Collect metadata by key
metadata_groups = {
k: [q.metadata[k] for q in group] for k in group_example.metadata.keys()
}
# Concatenate the metadata together
metadata: frozendict[str, np.ndarray] = frozendict(
{
k: np.concatenate(metadata_group, axis=axis)
for k, metadata_group in metadata_groups.items()
}
)
group_children = merge_metadata(group, axis)
yield node_type.make_node(
key=group_example.key,
metadata=metadata,
values=group_example.values,
children=group_children,
)

View File

@ -0,0 +1,271 @@
from __future__ import annotations
import random
from dataclasses import dataclass
from typing import TYPE_CHECKING, Callable, Iterable
try:
from IPython.display import display
except ImportError:
display = None
if TYPE_CHECKING:
from .Qube import Qube
@dataclass(frozen=True)
class HTML:
html: str
def _repr_html_(self):
return self.html
def summarize_node(
node: Qube, collapse=False, max_summary_length=50, **kwargs
) -> tuple[str, str, Qube]:
"""
Extracts a summarized representation of the node while collapsing single-child paths.
Returns the summary string and the last node in the chain that has multiple children.
"""
summaries = []
paths = []
while True:
summary = node.summary(**kwargs)
paths.append(summary)
if len(summary) > max_summary_length:
summary = summary[:max_summary_length] + "..."
summaries.append(summary)
if not collapse:
break
# Move down if there's exactly one child, otherwise stop
if len(node.children) != 1:
break
node = node.children[0]
# Add a "..." to represent nodes that we don't know about
if (not node.children) and (not node.is_leaf):
summaries.append("...")
return ", ".join(summaries), ",".join(paths), node
def node_tree_to_string(node: Qube, prefix: str = "", depth=None) -> Iterable[str]:
summary, path, node = summarize_node(node)
if depth is not None and depth <= 0:
yield summary + " - ...\n"
return
# Special case for nodes with only a single child, this makes the printed representation more compact
elif len(node.children) == 1:
yield summary + ", "
yield from node_tree_to_string(node.children[0], prefix, depth=depth)
return
else:
yield summary + "\n"
for index, child in enumerate(node.children):
connector = "└── " if index == len(node.children) - 1 else "├── "
yield prefix + connector
extension = " " if index == len(node.children) - 1 else ""
yield from node_tree_to_string(
child, prefix + extension, depth=depth - 1 if depth is not None else None
)
def summarize_node_html(
node: Qube,
collapse=False,
max_summary_length=50,
info: Callable[[Qube], str] | None = None,
**kwargs,
) -> tuple[str, Qube]:
"""
Extracts a summarized representation of the node while collapsing single-child paths.
Returns the summary string and the last node in the chain that has multiple children.
"""
if info is None:
def info_func(node: Qube, /):
return (
# f"dtype: {node.dtype}\n"
f"metadata: {dict(node.metadata)}\n"
)
else:
info_func = info
summaries = []
while True:
path = node.summary(**kwargs)
summary = path
if len(summary) > max_summary_length:
summary = summary[:max_summary_length] + "..."
info_string = info_func(node)
summary = f'<span class="qubed-node" data-path="{path}" title="{info_string}">{summary}</span>'
summaries.append(summary)
if not collapse:
break
# Move down if there's exactly one child, otherwise stop
if len(node.children) != 1:
break
node = node.children[0]
if (not node.children) and (not node.is_leaf):
summary = (
'<span class="qubed-node" data-path="" title="Truncated Nodes">...</span>'
)
summaries.append(summary)
return ", ".join(summaries), node
def _node_tree_to_html(
node: Qube,
prefix: str = "",
depth=1,
connector="",
info: Callable[[Qube], str] | None = None,
**kwargs,
) -> Iterable[str]:
summary, node = summarize_node_html(node, info=info, **kwargs)
if len(node.children) == 0:
yield f'<span class="qubed-level">{connector}{summary}</span>'
return
else:
open = "open" if depth > 0 else ""
yield f'<details {open}><summary class="qubed-level">{connector}{summary}</summary>'
for index, child in enumerate(node.children):
connector = "└── " if index == len(node.children) - 1 else "├── "
extension = " " if index == len(node.children) - 1 else ""
yield from _node_tree_to_html(
child,
prefix + extension,
depth=depth - 1,
connector=prefix + connector,
info=info,
**kwargs,
)
yield "</details>"
def node_tree_to_html(
node: Qube,
depth=1,
include_css=True,
include_js=True,
css_id=None,
info: Callable[[Qube], str] | None = None,
**kwargs,
) -> str:
if css_id is None:
css_id = f"qubed-tree-{random.randint(0, 1000000)}"
# It's ugly to use an f-string here because CSS uses {} so much, so instead
# we use CSS_ID as a placeholder and replace it later
css = """
<style>
pre#CSS_ID {
font-family: monospace;
white-space: pre;
font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;
font-size: 12px;
line-height: 1.4;
details {
margin-left: 0;
}
.qubed-level a {
margin-left: 10px;
text-decoration: none;
}
summary {
list-style: none;
cursor: pointer;
text-overflow: ellipsis;
overflow: hidden;
text-wrap: nowrap;
display: block;
}
span.qubed-node:hover {
background-color: #f0f0f0;
}
details > summary::after {
content: '';
}
details:not([open]) > summary::after {
content: "";
}
.qubed-level {
text-overflow: ellipsis;
overflow: hidden;
text-wrap: nowrap;
display: block;
}
summary::-webkit-details-marker {
display: none;
content: "";
}
}
</style>
""".replace("CSS_ID", css_id)
# This js snippet copies the path of a node to the clipboard when clicked
js = """
<script type="module" defer>
async function nodeOnClick(event) {
if (!event.altKey) return;
event.preventDefault();
let current_element = this.parentElement;
let paths = [];
while (true) {
if (current_element.dataset.path) {
paths.push(current_element.dataset.path);
}
current_element = current_element.parentElement;
if (current_element.tagName == "PRE") break;
}
const path = paths.reverse().slice(1).join(",");
await navigator.clipboard.writeText(path);
}
const nodes = document.querySelectorAll("#CSS_ID .qubed-node");
nodes.forEach(n => n.addEventListener("click", nodeOnClick));
</script>
""".replace("CSS_ID", css_id)
nodes = "".join(_node_tree_to_html(node=node, depth=depth, info=info, **kwargs))
return f"{js if include_js else ''}{css if include_css else ''}<pre class='qubed-tree' id='{css_id}'>{nodes}</pre>"
def _display(qube: Qube, **kwargs):
if display is None:
print(qube)
else:
def info(node: Qube):
return f"""\
structural_hash = {node.structural_hash}
metadata = {dict(node.metadata)}
is_root = {node.is_root}
is_leaf = {node.is_leaf}
"""
kwargs = {"info": info} | kwargs
display(qube.html(**kwargs))

40
src/python/qubed/trie.py Normal file
View File

@ -0,0 +1,40 @@
from dataclasses import dataclass, field
character = str
@dataclass(unsafe_hash=True)
class TrieNode:
parent: "TrieNode | None"
parent_char: character
children: dict[character, "TrieNode"] = field(default_factory=dict)
@dataclass
class Trie:
root: TrieNode = field(default_factory=lambda: TrieNode(None, ""))
reverse_lookup: dict[int, TrieNode] = field(default_factory=dict)
def insert(self, word: str):
node = self.root
for char in word:
if char not in node.children:
new_node = TrieNode(node, char)
node.children[char] = new_node
node = node.children[char]
n_id = id(node)
if n_id not in self.reverse_lookup:
self.reverse_lookup[n_id] = node
return n_id
def lookup_by_id(self, n_id: int):
leaf_node = self.reverse_lookup[n_id]
string = []
while leaf_node.parent is not None:
string.append(leaf_node.parent_char)
leaf_node = leaf_node.parent
return "".join(reversed(string))

View File

@ -0,0 +1,443 @@
from __future__ import annotations
import dataclasses
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from typing import (
TYPE_CHECKING,
Any,
FrozenSet,
Iterable,
Iterator,
Literal,
Sequence,
TypeVar,
)
if TYPE_CHECKING:
from .Qube import Qube
@dataclass(frozen=True)
class ValueGroup(ABC):
@abstractmethod
def dtype(self) -> str:
"Provide a string rep of the datatype of these values"
pass
@abstractmethod
def summary(self) -> str:
"Provide a string summary of the value group."
pass
@abstractmethod
def __contains__(self, value: Any) -> bool:
"Given a value, coerce to the value type and determine if it is in the value group."
pass
@abstractmethod
def to_json(self) -> dict:
"Return a JSON serializable representation of the value group."
pass
@abstractmethod
def min(self):
"Return the minimum value in the group."
pass
@classmethod
@abstractmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[ValueGroup]:
"Given a list of strings, return a one or more ValueGroups of this type."
pass
@abstractmethod
def __iter__(self) -> Iterator:
"Iterate over the values in the group."
pass
@abstractmethod
def __len__(self) -> int:
pass
T = TypeVar("T")
EnumValuesType = FrozenSet[T]
_dtype_map: dict[str, type] = {
"str": str,
"int64": int,
"float64": float,
"date": datetime,
}
_dtype_map_inv: dict[type, str] = {v: k for k, v in _dtype_map.items()}
_dtype_formatters = {
"str": str,
"int64": int,
"float64": float,
"date": datetime.fromisoformat,
}
@dataclass(frozen=True, order=True)
class QEnum(ValueGroup):
"""
The simplest kind of value group is just a list of strings.
summary -> string1/string2/string....
"""
values: EnumValuesType
_dtype: str = "str"
def __init__(self, obj, dtype="str"):
object.__setattr__(self, "values", tuple(sorted(obj)))
object.__setattr__(
self,
"_dtype",
dtype,
)
def __post_init__(self):
assert isinstance(self.values, tuple)
def __iter__(self):
return iter(self.values)
def __len__(self) -> int:
return len(self.values)
def summary(self) -> str:
return "/".join(map(str, sorted(self.values)))
def __contains__(self, value: Any) -> bool:
return value in self.values
def dtype(self):
return self._dtype
@classmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[ValueGroup]:
return [cls(tuple(values))]
def min(self):
return min(self.values)
def to_json(self):
return {"type": "enum", "dtype": self.dtype(), "values": self.values}
# @classmethod
# def from_json(cls, type: Literal["enum"], dtype: str, values: list):
# dtype_formatter = _dtype_formatters[dtype]
@classmethod
def from_list(cls, obj):
example = obj[0]
dtype = type(example)
assert all(type(v) is dtype for v in obj)
return cls(obj, dtype=_dtype_map_inv[dtype])
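# Illustrative example: QEnum.from_list([2, 1, 3]) gives a QEnum with values (1, 2, 3)
# and dtype "int64" (looked up via _dtype_map_inv).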
@dataclass(frozen=True, order=True)
class WildcardGroup(ValueGroup):
def summary(self) -> str:
return "*"
def __contains__(self, value: Any) -> bool:
return True
def to_json(self):
return "*"
def min(self):
return "*"
def __len__(self):
return 1
def __iter__(self):
return iter(["*"])
def __bool__(self):
return True
def dtype(self):
return "*"
@classmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[ValueGroup]:
return [WildcardGroup()]
class DateEnum(QEnum):
def summary(self) -> str:
def fmt(d):
return d.strftime("%Y%m%d")
return "/".join(map(fmt, sorted(self.values)))
@dataclass(frozen=True)
class Range(ValueGroup, ABC):
dtype: str = dataclasses.field(kw_only=True)
start: Any
end: Any
step: Any
def min(self):
return self.start
def __iter__(self) -> Iterator[Any]:
i = self.start
while i <= self.end:
yield i
i += self.step
def to_json(self):
return dataclasses.asdict(self)
@dataclass(frozen=True)
class DateRange(Range):
start: date
end: date
step: timedelta
dtype: Literal["date"] = dataclasses.field(kw_only=True, default="date")
def __len__(self) -> int:
return (self.end - self.start) // self.step
def __iter__(self) -> Iterator[date]:
current = self.start
while current <= self.end if self.step.days > 0 else current >= self.end:
yield current
current += self.step
@classmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[DateRange | DateEnum]:
dates = sorted([datetime.strptime(v, "%Y%m%d") for v in values])
if len(dates) < 2:
return [DateEnum(dates)]
ranges: list[DateEnum | DateRange] = []
current_group, dates = (
[
dates[0],
],
dates[1:],
)
current_type: Literal["enum", "range"] = "enum"
while len(dates) > 1:
if current_type == "range":
# If the next date fits then add it to the current range
if dates[0] - current_group[-1] == timedelta(days=1):
current_group.append(dates.pop(0))
# Emit the current range and start a new one
else:
if len(current_group) == 1:
ranges.append(DateEnum(current_group))
else:
ranges.append(
DateRange(
start=current_group[0],
end=current_group[-1],
step=timedelta(days=1),
)
)
current_group = [
dates.pop(0),
]
current_type = "enum"
if current_type == "enum":
# If the next date is one more than the last then switch to range mode
if dates[0] - current_group[-1] == timedelta(days=1):
last = current_group.pop()
if current_group:
ranges.append(DateEnum(current_group))
current_group = [last, dates.pop(0)]
current_type = "range"
else:
current_group.append(dates.pop(0))
# Handle remaining `current_group`
if current_group:
if current_type == "range":
ranges.append(
DateRange(
start=current_group[0],
end=current_group[-1],
step=timedelta(days=1),
)
)
else:
ranges.append(DateEnum(current_group))
return ranges
def __contains__(self, value: Any) -> bool:
v = datetime.strptime(value, "%Y%m%d").date()
return self.start <= v <= self.end and (v - self.start) % self.step == 0
def summary(self) -> str:
def fmt(d):
return d.strftime("%Y%m%d")
if self.step == timedelta(days=0):
return f"{fmt(self.start)}"
if self.step == timedelta(days=1):
return f"{fmt(self.start)}/to/{fmt(self.end)}"
return (
f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step // timedelta(days=1)}"
)
@dataclass(frozen=True)
class TimeRange(Range):
start: int
end: int
step: int
dtype: Literal["time"] = dataclasses.field(kw_only=True, default="time")
def min(self):
return self.start
def __iter__(self) -> Iterator[Any]:
return super().__iter__()
@classmethod
def from_strings(cls, values: Iterable[str]) -> list["TimeRange"]:
times = sorted([int(v) for v in values])
if len(times) < 2:
return [TimeRange(start=times[0], end=times[0], step=100)]
ranges = []
current_range, times = (
[
times[0],
],
times[1:],
)
while len(times) > 1:
if times[0] - current_range[-1] == 1:
current_range.append(times.pop(0))
elif len(current_range) == 1:
ranges.append(
TimeRange(start=current_range[0], end=current_range[0], step=0)
)
current_range = [
times.pop(0),
]
else:
ranges.append(
TimeRange(start=current_range[0], end=current_range[-1], step=1)
)
current_range = [
times.pop(0),
]
return ranges
def __len__(self) -> int:
return (self.end - self.start) // self.step
def summary(self) -> str:
def fmt(d):
return f"{d:04d}"
if self.step == 0:
return f"{fmt(self.start)}"
return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"
def __contains__(self, value: Any) -> bool:
v = int(value)
return self.start <= v <= self.end and (v - self.start) % self.step == 0
@dataclass(frozen=True)
class IntRange(Range):
start: int
end: int
step: int
dtype: Literal["int"] = dataclasses.field(kw_only=True, default="int")
def __len__(self) -> int:
return (self.end - self.start) // self.step
def summary(self) -> str:
def fmt(d):
return d
if self.step == 0:
return f"{fmt(self.start)}"
return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"
def __contains__(self, value: Any) -> bool:
v = int(value)
return self.start <= v <= self.end and (v - self.start) % self.step == 0
@classmethod
def from_strings(cls, values: Iterable[str]) -> list["IntRange"]:
ints = sorted([int(v) for v in values])
if len(ints) < 2:
return [IntRange(start=ints[0], end=ints[0], step=0)]
ranges = []
current_range, ints = (
[
ints[0],
],
ints[1:],
)
while len(ints) > 1:
if ints[0] - current_range[-1] == 1:
current_range.append(ints.pop(0))
elif len(current_range) == 1:
ranges.append(
IntRange(start=current_range[0], end=current_range[0], step=0)
)
current_range = [
ints.pop(0),
]
else:
ranges.append(
IntRange(start=current_range[0], end=current_range[-1], step=1)
)
current_range = [
ints.pop(0),
]
return ranges
def values_from_json(obj: dict | list) -> ValueGroup:
if isinstance(obj, list):
return QEnum.from_list(obj)
match obj["type"]:
case "enum":
QEnum.from_json(**obj)
case _:
raise ValueError(f"Unknown dtype {obj['dtype']}")
def convert_datatypes(q: "Qube", conversions: dict[str, ValueGroup]) -> "Qube":
def _convert(q: "Qube") -> Iterator["Qube"]:
if q.key in conversions:
data_type = conversions[q.key]
assert isinstance(q.values, QEnum), (
"Only QEnum values can be converted to other datatypes."
)
for values_group in data_type.from_strings(q.values):
# print(values_group)
yield q.replace(values=values_group)
else:
yield q
return q.transform(_convert)

32
src/qube.proto Normal file
View File

@ -0,0 +1,32 @@
syntax = "proto3";
message NdArray {
repeated int64 shape = 1;
string dtype = 2;
bytes raw = 3;
}
message StringGroup {repeated string items = 1; }
// Stores values, i.e. for class=1/2/3 the 1/2/3 part
message ValueGroup {
oneof payload {
StringGroup s = 1;
NdArray tensor = 2;
}
}
message MetadataGroup {
oneof payload {
NdArray tensor = 1;
}
}
message Qube {
string key = 1;
ValueGroup values = 2;
map<string, MetadataGroup> metadata = 3;
string dtype = 4;
repeated Qube children = 5;
bool is_root = 6;
}

View File

@ -1,7 +1,3 @@
#![allow(unused_imports)]
#![allow(dead_code)]
#![allow(unused_variables)]
use rsfdb::listiterator::KeyValueLevel;
use rsfdb::request::Request;
use rsfdb::FDB;
@ -9,8 +5,6 @@ use rsfdb::FDB;
use serde_json::{json, Value};
use std::time::Instant;
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyInt, PyList, PyString};
use std::collections::HashMap;

147
src/rust/formatters/mod.rs Normal file
View File

@ -0,0 +1,147 @@
use crate::{Node, NodeId, Qube};
use itertools::Itertools;
use itertools::Position;
impl Node {
/// Generate a human readable summary of the node
/// Examples include: key=value1/value2/.../valueN, key=value1/to/valueN, key=*, root etc.
pub fn summary(&self, qube: &Qube) -> String {
if self.is_root() {
return "root".to_string();
}
let key = &qube[self.key];
let values: String =
Itertools::intersperse(self.values.iter().map(|id| &qube[*id]), "/").collect();
format!("{}={}", key, values)
}
pub fn html_summary(&self, qube: &Qube) -> String {
if self.is_root() {
return r#"<span class="qubed-node">root</span>"#.to_string();
}
let key = &qube[self.key];
let values: String =
Itertools::intersperse(self.values.iter().map(|id| &qube[*id]), "/").collect();
let summary = format!("{}={}", key, values);
let path = summary.clone();
let info = format!("is_root: {}", self.is_root());
format!(r#"<span class="qubed-node" data-path="{path}" title="{info}">{summary}</span>"#)
}
}
struct NodeSummary {
summary: String,
end: NodeId,
}
enum SummaryType {
PlainText,
HTML,
}
/// Given a Node, traverse the tree until a node has more than one child.
/// Returns a summary of the form "key1=v1/v2, key2=v1/v2/v3, key3=v1"
/// and the id of the last node in the summary
fn summarise_nodes(qube: &Qube, node_id: &NodeId, summary_type: SummaryType) -> NodeSummary {
let mut node_id = *node_id;
let mut summary_vec = vec![];
loop {
let node = &qube[node_id];
let summary = match summary_type {
SummaryType::PlainText => node.summary(&qube),
SummaryType::HTML => node.html_summary(&qube),
};
summary_vec.push(summary);
// Bail out if the node has anything other than exactly one child.
match node.has_exactly_one_child() {
Some(n) => node_id = n,
None => break,
};
}
NodeSummary {
summary: summary_vec.join(", "),
end: node_id,
}
}
fn qube_to_tree(qube: &Qube, node_id: &NodeId, prefix: &str, depth: usize) -> String {
let NodeSummary {
summary,
end: node_id,
} = summarise_nodes(qube, node_id, SummaryType::PlainText);
let mut output: Vec<String> = Vec::new();
if depth <= 0 {
return format!("{} - ...\n", summary);
} else {
output.push(format!("{}\n", summary));
}
let node = &qube[node_id];
for (position, child_id) in node.children().with_position() {
let (connector, extension) = match position {
Position::Last | Position::Only => ("└── ", " "),
_ => ("├── ", ""),
};
output.extend([
prefix.to_string(),
connector.to_string(),
qube_to_tree(qube, child_id, &format!("{prefix}{extension}"), depth - 1),
]);
}
output.join("")
}
fn qube_to_html(qube: &Qube, node_id: &NodeId, prefix: &str, depth: usize) -> String {
let NodeSummary {
summary,
end: node_id,
} = summarise_nodes(qube, node_id, SummaryType::PlainText);
let node = &qube[node_id];
let mut output: Vec<String> = Vec::new();
let open = if depth > 0 { "open" } else { "" };
output.push(format!(
r#"<details {open}><summary class="qubed-level">{summary}</summary>"#
));
for (position, child_id) in node.children().with_position() {
let (connector, extension) = match position {
Position::Last | Position::Only => ("└── ", " "),
_ => ("├── ", ""),
};
output.extend([
prefix.to_string(),
connector.to_string(),
qube_to_tree(qube, child_id, &format!("{prefix}{extension}"), depth - 1),
]);
}
output.join("")
}
impl Qube {
/// Return a string version of the Qube in the format
/// root
/// ├── class=od, expver=0001/0002, param=1/2
/// └── class=rd, param=1/2/3
pub fn string_tree(&self) -> String {
qube_to_tree(&self, &self.root, "", 5)
}
/// Return an HTML version of the Qube which renders like this
/// root
/// ├── class=od, expver=0001/0002, param=1/2
/// └── class=rd, param=1/2/3
/// But under the hood children are represented with a details/summary tag and each key=value is a span
/// CSS and JS functionality is bundled inside.
pub fn html_tree(&self) -> String {
qube_to_html(&self, &self.root, "", 5)
}
}

235
src/rust/lib.rs Normal file
View File

@ -0,0 +1,235 @@
#![allow(unused_imports)]
use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use pyo3::types::{PyDict, PyInt, PyList, PyString};
use python_interface::QubeError;
use std::collections::HashMap;
use std::iter;
use std::hash::Hash;
use std::rc::Rc;
use lasso::{Rodeo, Spur};
use std::num::NonZero;
use std::ops;
mod serialisation;
mod python_interface;
mod formatters;
mod set_operations;
// This data structure uses the Newtype Index Pattern
// See https://matklad.github.io/2018/06/04/newtype-index-pattern.html
// See also https://github.com/nrc/r4cppp/blob/master/graphs/README.md#rcrefcellnode for a discussion of other approaches to trees and graphs in rust.
// https://smallcultfollowing.com/babysteps/blog/2015/04/06/modeling-graphs-in-rust-using-vector-indices/
// Index types use struct Id(NonZero<usize>)
// This reserves 0 as a special value which allows Option<Id(NonZero<usize>)> to be the same size as usize.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub(crate) struct NodeId(NonZero<usize>);
// Allow node indices to index directly into Qubes:
impl ops::Index<NodeId> for Qube {
type Output = Node;
fn index(&self, index: NodeId) -> &Node {
&self.nodes[index.0.get() - 1]
}
}
impl ops::IndexMut<NodeId> for Qube {
fn index_mut(&mut self, index: NodeId) -> &mut Node {
&mut self.nodes[index.0.get() - 1]
}
}
impl ops::Index<StringId> for Qube {
type Output = str;
fn index(&self, index: StringId) -> &str {
&self.strings[index]
}
}
impl NodeId {
pub fn new(value: usize) -> Option<NodeId> {
NonZero::new(value).map(NodeId)
}
}
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
struct StringId(lasso::Spur);
impl ops::Index<StringId> for lasso::Rodeo {
type Output = str;
fn index(&self, index: StringId) -> &str {
&self[index.0]
}
}
#[derive(Debug, Clone)]
pub(crate) struct Node {
pub key: StringId,
pub metadata: HashMap<StringId, Vec<String>>,
pub parent: Option<NodeId>, // If not present, it's the root node
pub values: Vec<StringId>,
pub children: HashMap<StringId, Vec<NodeId>>,
}
impl Node {
fn new_root(q: &mut Qube) -> Node {
Node {
key: q.get_or_intern("root"),
metadata: HashMap::new(),
parent: None,
values: vec![],
children: HashMap::new(),
}
}
fn children(&self) -> impl Iterator<Item = &NodeId> {
self.children.values().flatten()
}
fn is_root(&self) -> bool {
self.parent.is_none()
}
/// Because children are stored grouped by key
/// determining the number of children quickly takes a little effort.
/// This is a fast method for the special case of checking if a Node has exactly one child.
/// Returns Some(NodeId) if there is exactly one child, else None
fn has_exactly_one_child(&self) -> Option<NodeId> {
if self.children.len() != 1 {return None}
let Some(value_group) = self.children.values().next() else {return None};
let [node_id] = &value_group.as_slice() else {return None};
Some(*node_id)
}
fn n_children(&self) -> usize {
self.children
.values()
.map(|v| v.len())
.sum()
}
fn keys<'a>(&'a self, q: &'a Qube) -> impl Iterator<Item = &'a str> {
self.children.keys()
.map(|s| {&q[*s]})
}
}
#[derive(Debug, Clone)]
#[pyclass(subclass, dict)]
pub struct Qube {
pub root: NodeId,
nodes: Vec<Node>,
strings: Rodeo,
}
impl Qube {
pub fn new() -> Self {
let mut q = Self {
root: NodeId::new(1).unwrap(),
nodes: Vec::new(),
strings: Rodeo::default(),
};
let root = Node::new_root(&mut q);
q.nodes.push(root);
q
}
fn get_or_intern(&mut self, val: &str) -> StringId {
StringId(self.strings.get_or_intern(val))
}
pub(crate) fn add_node(&mut self, parent: NodeId, key: &str, values: impl IntoIterator<Item = impl AsRef<str>>) -> NodeId {
let key_id = self.get_or_intern(key);
let values = values.into_iter().map(|val| self.get_or_intern(val.as_ref())).collect();
// Create the node object
let node = Node {
key: key_id,
metadata: HashMap::new(),
values: values,
parent: Some(parent),
children: HashMap::new(),
};
// Insert it into the Qube arena and determine its id
self.nodes.push(node);
let node_id = NodeId::new(self.nodes.len()).unwrap();
// Add a reference to this node's id to the parents list of children.
let parent_node = &mut self[parent];
let key_group = parent_node.children.entry(key_id).or_insert(Vec::new());
key_group.push(node_id);
node_id
}
fn print(&self, node_id: Option<NodeId>) -> String {
let node_id: NodeId = node_id.unwrap_or(self.root);
let node = &self[node_id];
node.summary(&self)
}
fn get_node_ref(&self, id: NodeId) -> NodeRef {
let node = &self[id];
NodeRef { id, node, qube: self }
}
pub fn get_string_id(&self, s: &str) -> Option<StringId> {
self.strings.get(s)
.map(|id| StringId(id))
}
}
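// Illustrative sketch (not in the original source): building a small Qube by hand
// from within the crate using the methods above.
//   let mut q = Qube::new();
//   let root = q.root;
//   let class = q.add_node(root, "class", ["od"]);
//   let _param = q.add_node(class, "param", ["1", "2"]);
//   println!("{}", q.string_tree());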
#[pymodule]
fn rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<Qube>()?;
m.add("QubeError", py.get_type::<python_interface::QubeError>())?;
Ok(())
}
pub struct NodeRef<'a> {
pub id: NodeId,
pub node: &'a Node,
pub qube: &'a Qube,
}
impl<'a> NodeRef<'a> {
pub fn keys(&self) -> impl Iterator<Item = &str> {
self.node.keys(self.qube)
}
fn flat_children(&'a self) -> impl Iterator<Item = Self> {
self.node.children
.values()
.flatten()
.map(|id| {
NodeRef { id: *id, node: &self.qube[*id], qube: self.qube }
})
}
fn children_by_key(&'a self, key: &str) -> impl Iterator<Item = Self> {
let id = self.qube.get_string_id(key);
let children = id
.map(|i| self.node.children.get(&i))
.flatten();
children.map(
|ids| ids.into_iter().map(
|id| {
NodeRef { id: *id, node: &self.qube[*id], qube: self.qube }
})).into_iter().flatten()
}
}

View File

@ -0,0 +1,179 @@
use crate::{Node, NodeId, Qube, NodeRef};
use pyo3::prelude::*;
use pyo3::types::{PyList, PyType};
use core::borrow;
use std::ops::Deref;
use std::cell::Ref;
use crate::set_operations;
use crate::serialisation;
use itertools::Itertools;
use pyo3::create_exception;
create_exception!(qubed, QubeError, pyo3::exceptions::PyException);
/// A reference to a particular node in a Qube
#[pyclass]
pub struct PyNodeRef {
id: NodeId,
qube: Py<Qube>, // see https://pyo3.rs/v0.23.1/types for a discussion of Py<T> and Bound<'py, T>
}
fn into_py_node_ref(node_ref: NodeRef, qube: Py<Qube>) -> PyNodeRef {
PyNodeRef {
id: node_ref.id,
qube: qube,
}
}
#[pymethods]
impl PyNodeRef {
fn __repr__(&self, py: Python) -> PyResult<String> {
// Get the Py<Qube> reference, bind it to the GIL.
let qube = self.qube.bind(py);
fn repr_helper<'py>(node_id: NodeId, qube: &Bound<'py, Qube>) -> String {
let node = &qube.borrow()[node_id];
let key = &qube.borrow()[node.key];
let children = node
.children
.values()
.flatten()
.map(|child_id| repr_helper(child_id.clone(), qube))
.collect::<Vec<String>>()
.join(", ");
format!("Node({}, {})", key, children)
}
Ok(repr_helper(self.id, qube))
}
fn __str__(&self, py: Python) -> String {
let qube = self.qube.bind(py).borrow();
let node = &qube[self.id];
let key = &qube.strings[node.key];
format!("Node({})", key)
}
#[getter]
pub fn get_children(&self, py: Python) -> Vec<Self> {
let qube = self.qube.bind(py).borrow();
let node = &qube[self.id];
node.children
.values()
.flatten()
.map(|child_id| Self {
id: *child_id,
qube: self.qube.clone_ref(py),
})
.collect()
}
}
#[derive(FromPyObject)]
pub enum OneOrMany<T> {
One(T),
Many(Vec<T>),
}
// Todo: Is there a way to rewrite this so that it doesn't allocate?
// Perhaps by returning an iterator?
impl<T> Into<Vec<T>> for OneOrMany<T> {
fn into(self) -> Vec<T> {
match self {
OneOrMany::One(v) => vec![v],
OneOrMany::Many(vs) => vs,
}
}
}
#[pymethods]
impl Qube {
#[new]
pub fn py_new() -> Self {
Qube::new()
}
#[pyo3(name = "add_node")]
pub fn py_add_node(
slf: Bound<'_, Self>,
parent: PyRef<'_, PyNodeRef>,
key: &str,
values: OneOrMany<String>,
) -> PyResult<PyNodeRef> {
// Check that the given parent is actually in this qube and not another one
if !parent.qube.bind(slf.py()).is(&slf) {
return Err(QubeError::new_err("Supplied parent node is not in the target qube."))
}
// massage values from T | Vec<T> into Vec<T>
let values: Vec<String> = values.into();
let mut q = slf.borrow_mut();
let node_id = q.add_node(parent.id, key, &values);
Ok(PyNodeRef { id: node_id, qube: slf.into()})
}
pub fn set_root(
slf: Bound<'_, Self>,
node: PyRef<'_, PyNodeRef>,
) -> () {
let mut q = slf.borrow_mut();
q.root = node.id;
}
#[getter]
fn get_root(slf: Bound<'_, Self>) -> PyResult<PyNodeRef> {
Ok(PyNodeRef {
id: slf.borrow().root,
qube: slf.unbind(),
})
}
fn __repr__(&self) -> String {
// format!("{:?}", self)
let nodes_str: String = self.nodes.iter()
.enumerate()
.map(|(id, node)| {
format!("{{id: {}, key: {}, values: [{}], children: [{}]}}",
id+1,
&self[node.key],
node.values.iter().map(|s| &self[*s]).join(", "),
node.children().map(|n| n.0).join(", "),
)
}).join(", ");
format!("Qube {{root: {}, nodes: {}}}", self.root.0, nodes_str)
}
fn __str__<'py>(&self) -> String {
self.string_tree()
}
fn _repr_html_(&self) -> String {
self.html_tree()
}
#[pyo3(name = "print")]
fn py_print(&self) -> String {
self.print(Option::None)
}
#[getter]
pub fn get_children(slf: Bound<'_, Self>, py: Python) -> PyResult<Vec<PyNodeRef>> {
let root = PyNodeRef {
id: slf.borrow().root,
qube: slf.unbind(),
};
Ok(root.get_children(py))
}
#[staticmethod]
pub fn from_json(data: &str) -> Result<Self, serialisation::JSONError> {
serialisation::from_json(data)
}
pub fn __or__(slf: Bound<'_, Self>, other: Bound<'_, Qube>) -> Qube {
set_operations::set_operation(&slf.borrow(), &other.borrow(), set_operations::Op::Union)
}
}

View File

@ -0,0 +1,80 @@
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;
use crate::{Node, NodeId, Qube};
// Use a newtype wrapper to allow us to implement auto conversion from serde_json::Error to PyErr
// via a wrapper intermediate
// see https://pyo3.rs/main/function/error-handling.html#foreign-rust-error-types
pub struct JSONError(serde_json::Error);
impl From<JSONError> for PyErr {
fn from(error: JSONError) -> Self {
PyValueError::new_err(format!("{}", error.0))
}
}
impl From<serde_json::Error> for JSONError {
fn from(other: serde_json::Error) -> Self {
Self(other)
}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "dtype")]
enum Ranges {
Int64{values: Vec<(i64, i64)>}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "dtype", rename_all = "lowercase")]
enum Enum {
Str{values: Vec<String>}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "type", rename_all = "lowercase")]
enum Values {
Wildcard{},
Enum(Enum),
Range(Ranges)
}
#[derive(Serialize, Deserialize, Debug)]
struct JSONQube {
key: String,
values: Values,
metadata: HashMap<String, String>,
children: Vec<JSONQube>,
}
fn add_nodes(qube: &mut Qube, parent: NodeId, nodes: &[JSONQube]) -> Vec<NodeId> {
nodes
.iter()
.map(|json_node| {
let values = match &json_node.values {
Values::Wildcard{} => &vec!["*"],
Values::Enum(Enum::Str{values}) => &values.iter().map(|s| s.as_str()).collect(),
Values::Range(_) => todo!(),
};
let node_id = qube.add_node(parent, &json_node.key, values);
//
add_nodes(qube, node_id, &json_node.children);
node_id
})
.collect()
}
pub fn from_json(data: &str) -> Result<Qube, JSONError> {
// Parse the string of data into serde_json::Value.
let json_qube: JSONQube = serde_json::from_str(data).expect("JSON parsing failed");
let mut qube = Qube::new();
let root = qube.root;
add_nodes(&mut qube, root, &json_qube.children);
Ok(qube)
}

View File

@ -0,0 +1,2 @@
mod json;
pub use json::{from_json, JSONError};

View File

@ -0,0 +1,40 @@
use crate::NodeRef;
use crate::{Node, NodeId, Qube};
use itertools::chain;
use std::collections::HashSet;
pub enum Op {
Union,
Intersection,
Difference,
SymmetricDifference,
}
fn op_to_venn_diagram(op: Op) -> (bool, bool, bool) {
use Op::*;
match op {
Union => (true, true, true),
Intersection => (false, true, false),
Difference => (true, false, false),
SymmetricDifference => (true, false, true),
}
}
pub fn set_operation<'a>(a: &'a Qube, b: &'a Qube, op: Op) -> Qube {
todo!()
// _set_operation(a.root_ref(), a.root_ref(), op)
}
// fn _set_operation<'a>(a: NodeRef, b: NodeRef, op: Op) -> Qube {
// let keys: HashSet<&str> = HashSet::from_iter(chain(a.keys(), b.keys()));
// for key in keys {
// let a = a.children_by_key(key)
// }
// todo!()
// }
pub fn set_operation_inplace<'a>(a: &'a mut Qube, b: &'a Qube, op: Op) -> &'a Qube {
a
}

View File

@ -1,16 +1,21 @@
import json
import os
from collections import defaultdict
from typing import Any, Dict
import redis
import requests
import yaml
from fastapi import FastAPI, Request
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from tree_traverser import CompressedTree
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from frozendict import frozendict
from qubed import Qube
from qubed.tree_formatters import node_tree_to_html
app = FastAPI()
security = HTTPBearer()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@ -19,45 +24,97 @@ app.add_middleware(
allow_headers=["*"],
)
@app.get('/favicon.ico', include_in_schema=False)
async def favicon():
return FileResponse("favicon.ico")
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")
with open(os.environ.get("CONFIG_DIR", ".") + "/config.yaml", "r") as f:
config = yaml.safe_load(f)
qubes: dict[str, Qube] = {}
# print("Getting climate and extremes dt data from github")
# try:
# qubes["climate-dt"] = Qube.from_json(
# requests.get(
# "https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json",
# timeout=3).json()
# )
# qubes["extremes-dt"] = Qube.from_json(
# requests.get(
# "https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/extremes_dt.json",
# timeout=3).json()
# )
# mars_language = yaml.safe_load(
# requests.get(
# "https://github.com/ecmwf/qubed/raw/refs/heads/main/config/climate-dt/language.yaml",
# timeout=3).content
# )
# except:
qubes["climate-dt"] = Qube.empty()
qubes["extremes-dt"] = Qube.empty()
mars_language = {}
if "local_cache" in config:
print("Getting cache from local file")
with open(config["local_cache"], "r") as f:
json_data = f.read()
print("Found compressed catalog in local file")
if "LOCAL_CACHE" in os.environ:
print("Getting climate and extremes dt data from local files")
with open("../tests/example_qubes/climate_dt.json") as f:
qubes["climate-dt"] = Qube.from_json(json.load(f))
with open("../tests/example_qubes/extremes_dt.json") as f:
qubes["climate-dt"] = qubes["climate-dt"] | Qube.from_json(json.load(f))
with open("../tests/example_qubes/od.json") as f:
qubes["climate-dt"] = qubes["climate-dt"] | Qube.from_json(json.load(f))
with open("../config/language/language.yaml", "r") as f:
mars_language = yaml.safe_load(f)["_field"]
with open("../config/language/paramids.yaml", "r") as f:
params = yaml.safe_load(f)
else:
print("Getting cache from redis")
r = redis.Redis(host=os.environ.get("REDIS_HOST", "localhost"), port=6379, db=0)
json_data = r.get('compressed_catalog')
print("Getting climate and extremes dt data from github")
qubes["climate-dt"] = Qube.from_json(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json",
timeout=1,
).json()
)
qubes["extremes-dt"] = Qube.from_json(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/extremes_dt.json",
timeout=1,
).json()
)
print("Loading tree to json")
if not json_data:
c_tree = CompressedTree.from_json({})
qubes["od"] = Qube.from_json(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/od.json",
timeout=1,
).json()
)
qubes["climate-dt"] = qubes["climate-dt"] | qubes["extremes-dt"] | qubes["od"]
mars_language = yaml.safe_load(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/config/climate-dt/language.yaml",
timeout=3,
).content
)["_field"]
if "API_KEY" in os.environ:
api_key = os.environ["API_KEY"]
else:
compressed_tree_json = json.loads(json_data)
c_tree = CompressedTree.from_json(compressed_tree_json)
print("Partialy decompressing tree, shoud be able to skip this step in future.")
tree = c_tree.reconstruct_compressed_ecmwf_style()
with open("api_key.secret", "r") as f:
api_key = f.read()
print("Ready to serve requests!")
base = os.environ.get("CONFIG_DIR", ".")
config = {
"fdb_schema": f"{base}/schema",
"mars_language": f"{base}/language.yaml",
}
with open(config["mars_language"], "r") as f:
mars_language = yaml.safe_load(f)["_field"]
def validate_key(key: str):
if key not in qubes:
raise HTTPException(status_code=404, detail=f"Qube {key} not found")
return key
def request_to_dict(request: Request) -> Dict[str, Any]:
async def get_body_json(request: Request):
return await request.json()
def parse_request(request: Request) -> dict[str, str | list[str]]:
# Convert query parameters to dictionary format
request_dict = dict(request.query_params)
for key, value in request_dict.items():
@ -67,167 +124,253 @@ def request_to_dict(request: Request) -> Dict[str, Any]:
return request_dict
def match_against_cache(request, tree):
if not tree: return {"_END_" : {}}
matches = {}
for k, subtree in tree.items():
if len(k.split("=")) != 2:
raise ValueError(f"Key {k} is not in the correct format")
key, values = k.split("=")
values = set(values.split(","))
if key in request:
if isinstance(request[key], list):
matching_values = ",".join(request_value for request_value in request[key] if request_value in values)
if matching_values:
matches[f"{key}={matching_values}"] = match_against_cache(request, subtree)
elif request[key] in values:
matches[f"{key}={request[key]}"] = match_against_cache(request, subtree)
if not matches: return {k : {} for k in tree.keys()}
return matches
def validate_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
if credentials.credentials != api_key:
raise HTTPException(status_code=403, detail="Incorrect API Key")
return credentials
def max_tree_depth(tree):
"Figure out the maximum depth of a tree"
if not tree:
return 0
return 1 + max(max_tree_depth(v) for v in tree.values())
def prune_short_branches(tree, depth = None):
if depth is None:
depth = max_tree_depth(tree)
return {k : prune_short_branches(v, depth-1) for k, v in tree.items() if max_tree_depth(v) == depth-1}
def get_paths_to_leaves(tree):
for k,v in tree.items():
if not v:
yield [k,]
else:
for leaf in get_paths_to_leaves(v):
yield [k,] + leaf
def get_leaves(tree):
for k,v in tree.items():
if not v:
yield k
else:
for leaf in get_leaves(v):
yield leaf
@app.get("/match")
async def get_match(request: Request):
# Convert query parameters to dictionary format
request_dict = request_to_dict(request)
# Run the schema matching logic
match_tree = match_against_cache(request_dict, tree)
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
return FileResponse("favicon.ico")
# Prune the tree to only include branches that are as deep as the deepest match
# This means if you don't choose a certain branch at some point
# the UI won't keep nagging you to choose a value for that branch
match_tree = prune_short_branches(match_tree)
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
return templates.TemplateResponse(
"index.html",
{
"request": request,
"config": {
"message": "Hello from the dev server!",
},
"api_url": os.environ.get("API_URL", "/api/v1/"),
},
)
return match_tree
@app.get("/paths")
async def api_paths(request: Request):
request_dict = request_to_dict(request)
match_tree = match_against_cache(request_dict, tree)
match_tree = prune_short_branches(match_tree)
paths = get_paths_to_leaves(match_tree)
@app.get("/api/v1/keys/")
async def keys():
return list(qubes.keys())
# deduplicate leaves based on the key
by_path = defaultdict(lambda : {"paths" : set(), "values" : set()})
for p in paths:
if p[-1] == "_END_": continue
key, values = p[-1].split("=")
values = values.split(",")
path = tuple(p[:-1])
by_path[key]["values"].update(values)
by_path[key]["paths"].add(tuple(path))
@app.get("/api/v1/get/{key}/")
async def get(
key: str = Depends(validate_key),
request: dict[str, str | list[str]] = Depends(parse_request),
):
return qubes[key].to_json()
return [{
@app.post("/api/v1/union/{key}/")
async def union(
key: str,
credentials: HTTPAuthorizationCredentials = Depends(validate_api_key),
body_json=Depends(get_body_json),
):
if key not in qubes:
qubes[key] = Qube.empty()
q = Qube.from_json(body_json)
qubes[key] = qubes[key] | q
return qubes[key].to_json()
def follow_query(request: dict[str, str | list[str]], qube: Qube):
s = qube.select(request, mode="next_level", consume=False)
by_path = defaultdict(lambda: {"paths": set(), "values": set()})
for request, node in s.leaf_nodes():
if not node.metadata.get("is_leaf", True):
by_path[node.key]["values"].update(node.values.values)
by_path[node.key]["paths"].add(frozendict(request))
return s, [
{
"paths": list(v["paths"]),
"key": key,
"values": sorted(v["values"], reverse=True),
} for key, v in by_path.items()]
}
for key, v in by_path.items()
]
@app.get("/stac")
async def get_STAC(request: Request):
request_dict = request_to_dict(request)
paths = await api_paths(request)
@app.get("/api/v1/select/{key}/")
async def select(
key: str = Depends(validate_key),
request: dict[str, str | list[str]] = Depends(parse_request),
):
q = qubes[key].select(request)
return q.to_json()
@app.get("/api/v1/query/{key}")
async def query(
key: str = Depends(validate_key),
request: dict[str, str | list[str]] = Depends(parse_request),
):
qube, paths = follow_query(request, qubes[key])
return paths
@app.get("/api/v1/basicstac/{key}/{filters:path}")
async def basic_stac(filters: str, key: str = Depends(validate_key)):
pairs = filters.strip("/").split("/")
request = dict(p.split("=") for p in pairs if "=" in p)
qube, _ = follow_query(request, qubes[key])
def make_link(child_request):
"""Take a MARS Key and information about which paths matched up to this point and use it to make a STAC Link"""
kvs = [f"{key}={value}" for key, value in child_request.items()]
href = f"/api/v1/basicstac/{key}/{'/'.join(kvs)}"
last_key, last_value = list(child_request.items())[-1]
return {
"title": f"{last_key}={last_value}",
"href": href,
"rel": "child",
"type": "application/json",
}
# Format the response as a STAC collection
(this_key, this_value), *_ = (
list(request.items())[-1] if request else ("root", "root"),
None,
)
key_info = mars_language.get(this_key, {})
try:
values_info = dict(key_info.get("values", {}))
value_info = values_info.get(
this_value, f"No info found for value `{this_value}` found."
)
except ValueError:
value_info = f"No info found for value `{this_value}` found."
if this_key == "root":
value_info = "The root node"
# key_desc = key_info.get(
# "description", f"No description for `key` {this_key} found."
# )
print(this_key, this_value)
print(this_key, key_info)
stac_collection = {
"type": "Catalog",
"stac_version": "1.0.0",
"id": "root"
if not request
else "/".join(f"{k}={v}" for k, v in request.items()),
"title": f"{this_key}={this_value}",
"description": value_info,
"links": [make_link(leaf) for leaf in qube.leaves()],
# "debug": {
# "qube": str(qube),
# },
}
return stac_collection
@app.get("/api/v1/stac/{key}/")
async def get_STAC(
key: str = Depends(validate_key),
request: dict[str, str | list[str]] = Depends(parse_request),
):
qube, paths = follow_query(request, qubes[key])
kvs = [
f"{k}={','.join(v)}" if isinstance(v, list) else f"{k}={v}"
for k, v in request.items()
]
request_params = "&".join(kvs)
def make_link(key_name, paths, values):
"""Take a MARS Key and information about which paths matched up to this point and use it to make a STAC Link"""
path = paths[0]
href_template = f"/stac?{'&'.join(path)}{'&' if path else ''}{key_name}={{}}"
optional = [False]
optional_str = "Yes" if all(optional) and len(optional) > 0 else ("Sometimes" if any(optional) else "No")
values_from_mars_language = mars_language.get(key_name, {}).get("values", [])
# values = [v[0] if isinstance(v, list) else v for v in values_from_mars_language]
if all(isinstance(v, list) for v in values_from_mars_language):
value_descriptions_dict = {k : v[-1]
for v in values_from_mars_language
if len(v) > 1
for k in v[:-1]}
value_descriptions = [value_descriptions_dict.get(v, "") for v in values]
if not any(value_descriptions): value_descriptions = None
href_template = f"/stac?{request_params}{'&' if request_params else ''}{key_name}={{{key_name}}}"
print(f"{key_name = }")
if key_name == "param":
print(params)
values_from_mars_language = params
value_descriptions = [
max(params.get(int(v), [""]), key=len) for v in values
]
print(value_descriptions)
else:
values_from_mars_language = mars_language.get(key_name, {}).get(
"values", []
)
if all(isinstance(v, list) for v in values_from_mars_language):
value_descriptions_dict = {
k: v[-1]
for v in values_from_mars_language
if len(v) > 1
for k in v[:-1]
}
value_descriptions = [
value_descriptions_dict.get(v, "") for v in values
]
if not any(value_descriptions):
value_descriptions = None
return {
"title": key_name,
"generalized_datacube:href_template": href_template,
"rel": "child",
"type": "application/json",
"generalized_datacube:dimension" : {
"type" : mars_language.get(key_name, {}).get("type", ""),
"description": mars_language.get(key_name, {}).get("description", ""),
"values" : values,
"value_descriptions" : value_descriptions,
"optional" : any(optional),
"multiple": True,
"paths" : paths,
"title": key_name,
"uriTemplate": href_template,
"rel": "child",
"type": "application/json",
"variables": {
key_name: {
"type": "string",
"description": mars_language.get(key_name, {}).get(
"description", ""
),
"enum": values,
"value_descriptions": value_descriptions,
# "paths": paths,
}
}
},
}
def value_descriptions(key, values):
return {
v[0] : v[-1] for v in mars_language.get(key, {}).get("values", [])
v[0]: v[-1]
for v in mars_language.get(key, {}).get("values", [])
if len(v) > 1 and v[0] in list(values)
}
descriptions = {
key : {
"key" : key,
"values" : values,
"description" : mars_language.get(key, {}).get("description", ""),
"value_descriptions" : value_descriptions(key,values),
key: {
"key": key,
"values": values,
"description": mars_language.get(key, {}).get("description", ""),
"value_descriptions": value_descriptions(key, values),
}
for key, values in request_dict.items()
for key, values in request.items()
}
# Format the response as a STAC collection
stac_collection = {
"type": "Collection",
"type": "Catalog",
"stac_version": "1.0.0",
"id": "partial-matches",
"id": "root" if not request else "/stac?" + request_params,
"description": "STAC collection representing potential children of this request",
"links": [
make_link(p["key"], p["paths"], p["values"])
for p in paths
],
"links": [make_link(p["key"], p["paths"], p["values"]) for p in paths],
"debug": {
"request": request_dict,
# "request": request,
"descriptions": descriptions,
"paths" : paths,
}
# "paths": paths,
"qube": node_tree_to_html(
qube.compress(),
collapse=True,
depth=10,
include_css=False,
include_js=False,
max_summary_length=200,
css_id="qube",
),
},
}
return stac_collection
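For orientation, a small client sketch against the endpoints defined above. It assumes the dev server from `run.sh` below is running on localhost:8124 with a `climate-dt` qube loaded; endpoint paths and response shapes follow the handlers in this file.

```python
import requests

base = "http://localhost:8124/api/v1"

# List the available qubes, e.g. ["climate-dt", "extremes-dt"].
print(requests.get(f"{base}/keys/", timeout=5).json())

# Select a subtree of the climate-dt qube and fetch it as JSON.
subtree = requests.get(f"{base}/select/climate-dt/", params={"class": "d1"}, timeout=5).json()

# STAC-style browsing: each link describes the next selectable key and its allowed values.
stac = requests.get(f"{base}/stac/climate-dt/", params={"class": "d1"}, timeout=5).json()
for link in stac["links"]:
    key = link["title"]
    print(key, link["variables"][key]["enum"][:5])
```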

View File

@ -1,3 +1,5 @@
fastapi[standard]
pe
redis
frozendict
requests

View File

@ -1,3 +1,3 @@
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
cd "$parent_path"
CONFIG_DIR=../config/local fastapi dev ./main.py --port 8124 --reload
LOCAL_CACHE=True fastapi dev ./main.py --port 8124 --reload

stac_server/run_prod.sh Executable file
View File

@ -0,0 +1,3 @@
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
cd "$parent_path"
sudo LOCAL_CACHE=True ../../.venv/bin/fastapi dev ./main.py --port 80 --host=0.0.0.0 --reload

View File

@ -5,8 +5,14 @@ function getSTACUrlFromQuery() {
const params = new URLSearchParams(window.location.search);
// get current window url and remove path part
let api_url = new URL(window.location.href);
api_url.pathname = "/stac";
if (window.API_URL.startsWith("http")) {
// Absolute URL: Use it directly
api_url = new URL(window.API_URL);
} else {
// Relative URL: Combine with the current window's location
api_url = new URL(window.location.href);
api_url.pathname = window.API_URL;
}
for (const [key, value] of params.entries()) {
api_url.searchParams.set(key, value);
@ -132,39 +138,23 @@ async function createCatalogItem(link, itemsContainer) {
// Update the item div with real content
itemDiv.classList.remove("loading");
const dimension = link["generalized_datacube:dimension"];
const variables = link["variables"];
const key = Object.keys(variables)[0];
const variable = variables[key];
// add data-key attribute to the itemDiv
itemDiv.dataset.key = link.title;
itemDiv.dataset.keyType = dimension.type;
itemDiv.dataset.keyType = variable.type;
itemDiv.innerHTML = `
<h3 class="item-title">${link.title || "No title available"}</h3>
<p class="item-type">Key Type: ${itemDiv.dataset.keyType || "Unknown"}</p>
<!-- <p class="item-type">Paths: ${dimension.paths}</p> -->
<p class="item-type">Optional: ${dimension.optional ? "Yes" : "No"}</p>
<p class="item-description">${
dimension.description
? dimension.description.slice(0, 100)
: "No description available"
}...</p>
variable.description ? variable.description.slice(0, 100) : ""
}</p>
`;
// if (dimension.type === "date" || dimension.type === "time") {
// // Render a date picker for the "date" key
// const picker = `<input type="${link.title}" name="${link.title}">`;
// //convert picker to HTML node
// const pickerNode = document
// .createRange()
// .createContextualFragment(picker);
// itemDiv.appendChild(pickerNode);
// }
// Otherwise create a scrollable list with checkboxes for values if available
if (
// dimension.type === "enum" &&
dimension.values &&
dimension.values.length > 0
) {
if (variable.enum && variable.enum.length > 0) {
const listContainer = renderCheckboxList(link);
itemDiv.appendChild(listContainer);
} else {
@ -179,14 +169,15 @@ async function createCatalogItem(link, itemsContainer) {
}
function renderCheckboxList(link) {
const dimension = link["generalized_datacube:dimension"];
const value_descriptions = dimension.value_descriptions || [];
const variables = link["variables"];
const key = Object.keys(variables)[0];
const variable = variables[key];
const value_descriptions = variable.value_descriptions || [];
const listContainerHTML = `
<div class="item-list-container">
<label class="list-label">Select one or more values:</label>
<div class="scrollable-list">
${dimension.values
${variable.enum
.map((value, index) => {
const labelText = value_descriptions[index]
? `${value} - ${value_descriptions[index]}`
@ -195,7 +186,7 @@ function renderCheckboxList(link) {
<div class="checkbox-container">
<label class="checkbox-label">
<input type="checkbox" class="item-checkbox" value="${value}" ${
dimension.values.length === 1 ? "checked" : ""
variable.enum.length === 1 ? "checked" : ""
}>
${labelText}
</label>
@ -262,8 +253,10 @@ function renderRawSTACResponse(catalog) {
itemDetails.textContent = JSON.stringify(just_stac, null, 2);
const debug_container = document.getElementById("debug");
// create new object without debug key
debug_container.textContent = JSON.stringify(catalog.debug, null, 2);
const qube_container = document.getElementById("qube");
qube_container.innerHTML = catalog.debug.qube;
}
// Fetch STAC catalog and display items
@ -287,6 +280,7 @@ async function fetchCatalog(request, stacUrl) {
// Highlight the request and raw STAC
hljs.highlightElement(document.getElementById("raw-stac"));
hljs.highlightElement(document.getElementById("debug"));
hljs.highlightElement(document.getElementById("example-python"));
} catch (error) {
console.error("Error fetching STAC catalog:", error);
}

View File

@ -0,0 +1,50 @@
pre#qube {
font-family: monospace;
white-space: pre;
font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;
font-size: 12px;
line-height: 1.4;
details {
margin-left: 0;
}
.qubed-level a {
margin-left: 10px;
text-decoration: none;
}
summary {
list-style: none;
cursor: pointer;
text-overflow: ellipsis;
overflow: hidden;
text-wrap: nowrap;
display: block;
}
span.qubed-node:hover {
background-color: #f0f0f0;
}
details > summary::after {
content: ' ▲';
}
details:not([open]) > summary::after {
content: " ▼";
}
.qubed-level {
text-overflow: ellipsis;
overflow: hidden;
text-wrap: nowrap;
display: block;
}
summary::-webkit-details-marker {
display: none;
content: "";
}
}

View File

@ -2,6 +2,9 @@ html,
body {
min-height: 100vh;
height: 100%;
--accent-color: #003399;
--background-grey: #f4f4f4;
}
body {
@ -23,7 +26,7 @@ body {
width: 30%;
padding: 10px;
overflow-y: scroll;
background-color: #f4f4f4;
background-color: var(--background-grey);
border-right: 1px solid #ddd;
}
@ -45,7 +48,9 @@ body {
}
.sidebar-header button {
width: 10em;
width: 7em;
height: 2em;
padding: 0;
}
canvas {
@ -63,6 +68,7 @@ canvas {
margin-bottom: 10px;
border-radius: 5px;
transition: background-color 0.2s ease;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
}
.item-title {
@ -91,10 +97,8 @@ canvas {
}
.item.selected {
background-color: #d4e9ff;
/* Lighter blue for selection */
border-color: #003399;
/* Keep the original ECMWF blue for the border */
background-color: var(--background-grey);
border-color: var(--accent-color);
}
summary h2 {
@ -117,7 +121,7 @@ button {
/* Padding around button text */
margin: 0 5px;
/* Margin between buttons */
background-color: #003399;
background-color: var(--accent-color);
/* ECMWF blue */
color: white;
/* White text color */
@ -138,7 +142,6 @@ button:hover {
.item-list-container {
margin-top: 20px;
margin-bottom: 20px;
}
.scrollable-list {
@ -148,7 +151,6 @@ button:hover {
border: 1px solid #ccc;
border-radius: 4px;
background-color: #fff;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
}
.checkbox-container {
@ -168,14 +170,14 @@ button:hover {
}
.checkbox-container:hover .checkbox-label {
color: #003399;
color: var(--accent-color);
}
.list-label {
font-weight: bold;
margin-bottom: 0.5em;
display: block;
color: #003399;
color: var(--accent-color);
}
span.key,
@ -208,4 +210,8 @@ span.value:hover {
#details {
width: 100%;
}
}
}
details h2 {
font-size: medium;
}

View File

@ -5,6 +5,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ECMWF DestinE STAC Viewer</title>
<link rel="stylesheet" href="/static/styles.css" />
<link rel="stylesheet" href="/static/qube_styles.css" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/json.min.js"></script>
@ -24,24 +25,41 @@
<a id="stac-anchor"><button id="stac-btn">Raw STAC</button></a>
<button id="next-btn">Next</button>
</div>
<div id="items">
<!-- Items from the STAC catalog will be rendered here -->
</div>
</div>
<div id="details">
<h2>Current Request</h2>
Hover over a key or value for more info.
<h2>Current Selection</h2>
This is a <a href="https://github.com/ecmwf/datacube-spec/blob/main/spec/selection.md">MARS Selection</a> object in JSON format. Hover over a key or value for more info.
<!-- Container for the request part, preloaded to prevent layout shift. -->
<pre><code id="request-breakdown" class="language-json">
{
}
</code></pre>
<!-- Container to show the current tree -->
<h2>Currently Selected Tree</h2>
<p>This shows the data <a href="https://qubed.readthedocs.io/en/latest/quickstart.html">qube</a> that matches the current query. The leaves are the next set of available selections you can make.</p>
<pre id = "qube"></pre>
<details>
<summary><h2>Example Qubed Code</h2></summary>
See the <a href="https://qubed.readthedocs.io/en/latest/">Qubed documentation</a> for more details.
<pre><code id="example-python" class="language-python">
# pip install qubed requests
import requests
from qubed import Qube
qube = Qube.from_json(requests.get("{{ api_url }}select/climate-dt/?{{request.url.query}}").json())
qube.print()
</code></pre>
</details>
<!-- Container for the raw STAC response -->
<details open>
<details>
<summary><h2>Raw STAC Response</h2></summary>
<p>See the <a href="https://github.com/ecmwf-projects/catalogs/blob/main/structured_stac.md">extension proposal</a> for more details on the format.</p>
<p>See the <a href="https://github.com/ecmwf-projects/catalogs/blob/main/structured_stac.md">STAC Extension Proposal</a> for more details on the format.</p>
<pre class="json-pre"><code id="raw-stac" class="language-json"></code></pre>
</details>
@ -53,6 +71,9 @@
</div>
</div>
<script>
window.API_URL = "{{ api_url }}stac/climate-dt/";
</script>
<script src="/static/app.js"></script>
</body>
</html>

View File

@ -1,7 +1,5 @@
# STAC Generalized Datacubes Extension
# Template Extension Specification
- **Title:** Generalized Datacubes
- **Identifier:** <https://stac-extensions.github.io/template/v1.0.0/schema.json>
- **Field Name Prefix:** generalized_datacube
@ -9,18 +7,39 @@
- **Extension [Maturity Classification](https://github.com/radiantearth/stac-spec/tree/master/extensions/README.md#extension-maturity):** Proposal
- **Owner**: @TomHodson
This STAC extension allows for representation of [generalised datacubes][gen_datacubes].
This STAC extension borrows the [Draft OGC Records API](https://docs.ogc.org/DRAFTS/20-004.html), specifically the [templated links section](https://docs.ogc.org/DRAFTS/20-004.html#sc_templated_links_with_variables) to give STAC the ability to index very large datasets that conform to a generalised datacube model.
A datacube has a fixed set of dimensions `[a, b, c..]`, each of which has a fixed span `{a: ["temp","rainfall"], b : [1-7], c:[True, False]}`, such that we can access data by indexing, i.e. providing a value for each axis, `a="rainfall", b=1, ...`. Generalised datacubes allow the dimensions to change during indexing, so choosing `a="rainfall"` might yield a different set of axes from `a="temp"`.
A typical datacube has a fixed set of dimensions `[a, b, c..]`, each of which has a fixed span `{a: ["temp","rainfall"], b : [1-7], c:[True, False]}`, such that we can access data by indexing, i.e. providing a value for each axis, `a="rainfall", b=1, ...`. A generalized datacube, by our definition, allows the dimensions to change during indexing, so choosing `a="rainfall"` might yield a different set of axes from `a="temp"`.
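As a concrete (hypothetical) sketch of that definition, written as a nested mapping: the axes available below `a` depend on the value chosen for `a`.

```python
# Hypothetical generalized datacube: choosing a="temp" exposes a different set of
# axes than choosing a="rainfall".
generalized_datacube = {
    "a=temp":     {"b=1/2/3/4/5/6/7": {"c=True/False": {}}},
    "a=rainfall": {"b=1/2/3/4/5/6/7": {"gauge=G1/G2": {}}},  # "gauge" is made up for illustration
}
```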
The [STAC Datacube][datacube_extension] extension serves the needs of datacubes that appear in STAC as Items or Collections, i.e. as leaves in the tree. This extension instead focuses on allowing STAC to serve as an interface for dynamically exploring the branches of generalised datacubes. It does this by adding additional metadata to the children of Catalog entries.
The [STAC Datacube][datacube_extension] extension serves the needs of datacubes that appear in STAC as Items or Collections, i.e. as leaves in the tree. This extension instead focuses on allowing STAC to serve as an interface for dynamically exploring the branches of generalised datacubes. It does this by adding additional metadata from the OGC Records standard to the children of Catalog entries.
We take the *Dimension Objects* defined by the [Datacube Extension][datacube_extension] and add them to [Link objects][link_objects] under the key `generalized_datacube:dimension`. This enables a single Link Object to represent a whole axis and its allowed values. Since `href` must now be constructed dynamically, we remove it and add a `generalized_datacube:href_template` attribute to communicate how to construct the URLs corresponding to a particular choice of value or values.
In practice, what this proposal does is:
In order to support more complex slicing operations, in which multiple indices may be selected for a given dimension, we also add additional optional attributes to all *Dimension Objects*; these are:
1. For child items that represent many distinct children, replace `"links":` with `"linkTemplates":` in the Catalog entry. (Following the example of the OGC Records API.)
2. For each `rel: Child` object in `linkTemplates`:
* `optional` : bool, whether this dimension can be skipped.
* `multiple` : bool, whether multiple values can be selected for this key.
a. Add a `variables` key, following the OGC Records API, whose value is a dictionary with entries like
```json
"format": {
"type": "string",
"enum": [
"application/vnd.google-earth.kml+xml",
"application/vnd.google-earth.kmz",
"image/png",
"image/jpeg",
"image/gif",
"image/png; mode=8bit",
"application/x-pdf",
"image/svg+xml",
"image/tiff"
]
}
```
b. Add a "uriTemplate" key that specifies how to contruct the resulting URL: i.e `http://hostname.tld/app/index.html?class=od&format={format}`
This enables a child object to represent a whole axis and its allowed values. Since `href` must now be constructed dynamically, we remove it and add a `generalized_datacube:href_template` attribute to communicate how to construct the URLs corresponding to a particular choice of value or values.
[gen_datacubes]: https://github.com/ecmwf/datacube-spec
[link_objects]: https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#link-object
@ -36,22 +55,23 @@ A typical `Catalog` entry with this extension:
"id": "rainfall",
"stac_version": "1.0.0",
"description": "ECMWF's Operational Data Archive",
"links": [
"linkTemplates": [
{
"title": "Expver - Experiment Version",
"generalized_datacube:href_template": "http://136.156.129.226/app/index.html?class=od&expver={}",
"rel": "child",
"title": "Expver - Experiment Version",
"uriTemplate": "http://hostname.tld/app/index.html?class=od&expver={expver}",
"type": "application/json",
"generalized_datacube:dimension" : {
"type" : "enum",
"description": "Experiment version, 0001 selects operational data.",
"values" : ["0001", "xxxx"],
"value_descriptions" : ["Operational Data", "Experimental Data"],
"optional" : false,
"multiple": true,
"variables" : {
"expver" : {
"description": "Experiment version, 0001 selects operational data.",
"type" : "string",
"enum" : ["0001", "xxxx"],
"value_descriptions" : ["Operational Data", "Experimental Data"],
"optional" : false,
}
}
""
},
],
"stac_extensions": [
@ -72,120 +92,19 @@ The fields in the table below can be used in these parts of STAC documents:
- [ ] Assets (for both Collections and Items, incl. Item Asset Definitions in Collections)
- [x] Links
| Field Name | Type | Description |
| -------------------- | ------------------------- | -------------------------------------------------------- |
| axis:dimension | Dimension Object | Whether the axis is an enum, date range, time range etc |
| axis:href_template | string | Template used to construct the href for a particular choice of value |
| Field Name | Type | Description |
| -------------------- | ------------------------- | --------------------------------------------------------------------------------------------------------------------- |
| uriTemplate | URI Template | Of the form "http://hostname.tld/app/index.html?class=od&expver={expver}", follows OGC Records Spec for uriTemplates |
| variables | | |
### Additional Field Information
#### axis:dimension
#### uriTemplate
Todo
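As an illustrative, non-normative sketch, a client could expand the templated link from the example above by substituting a value permitted by its `variables` entry:

```python
# Illustrative only: expand a uriTemplate using a value allowed by its "variables" entry.
link = {
    "uriTemplate": "http://hostname.tld/app/index.html?class=od&expver={expver}",
    "variables": {"expver": {"type": "string", "enum": ["0001", "xxxx"]}},
}

choice = {"expver": "0001"}
assert choice["expver"] in link["variables"]["expver"]["enum"]
href = link["uriTemplate"].format(**choice)
# -> "http://hostname.tld/app/index.html?class=od&expver=0001"
```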
### Dimension Object
The dimension object reuses all those [defined by the datacube extension](https://github.com/stac-extensions/datacube#dimension-object), currently those are Horizontal Spatial Raster, Vertical Spatial, Temporal Dimension, Spatial Vector Dimension, Additional Dimension. They are reproduced below for reference.
These dimension objects are defined in addition:
### Enum Dimension Object
| Field Name | Type | Description |
| ---------------- | ----------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** `enum`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| values | \[number\|string] | An ordered list of all values, especially useful for [nominal](https://en.wikipedia.org/wiki/Level_of_measurement#Nominal_level) values. |
| value_descriptions | \[string] | Optionally provide a human readable description for each value. Useful if the values are codes that have defined meanings. |
| step | number\|null | If the dimension consists of [interval](https://en.wikipedia.org/wiki/Level_of_measurement#Interval_scale) values, the space between the values. Use `null` for irregularly spaced steps. |
| unit | string | The unit of measurement for the data, preferably compliant to [UDUNITS-2](https://ncics.org/portfolio/other-resources/udunits2/) units (singular). |
| reference_system | string | The reference system for the data. |
An Enum Dimension Object MUST specify `values`.
Dimension objects defined by the datacube extension:
### Horizontal Spatial Raster Dimension Object
A spatial raster dimension in one of the horizontal (x or y) directions.
| Field Name | Type | Description |
| ---------------- | -------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `spatial`. |
| axis | string | **REQUIRED.** Axis of the spatial raster dimension (`x`, `y`). |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[number] | **REQUIRED.** Extent (lower and upper bounds) of the dimension as two-element array. Open intervals with `null` are not allowed. |
| values | \[number] | Optionally, an ordered list of all values. |
| step | number\|null | The space between the values. Use `null` for irregularly spaced steps. |
| reference_system | string\|number\|object | The spatial reference system for the data, specified as [numerical EPSG code](http://www.epsg-registry.org/), [WKT2 (ISO 19162) string](http://docs.opengeospatial.org/is/18-010r7/18-010r7.html) or [PROJJSON object](https://proj.org/specifications/projjson.html). Defaults to EPSG code 4326. |
### Vertical Spatial Dimension Object
A spatial dimension in vertical (z) direction.
| Field Name | Type | Description |
| ---------------- | ---------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `spatial`. |
| axis | string | **REQUIRED.** Axis of the spatial dimension, always `z`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[number\|null\] | If the dimension consists of [ordinal](https://en.wikipedia.org/wiki/Level_of_measurement#Ordinal_scale) values, the extent (lower and upper bounds) of the values as two-element array. Use `null` for open intervals. |
| values | \[number\|string\] | An ordered list of all values, especially useful for [nominal](https://en.wikipedia.org/wiki/Level_of_measurement#Nominal_level) values. |
| step | number\|null | If the dimension consists of [interval](https://en.wikipedia.org/wiki/Level_of_measurement#Interval_scale) values, the space between the values. Use `null` for irregularly spaced steps. |
| unit | string | The unit of measurement for the data, preferably compliant to [UDUNITS-2](https://ncics.org/portfolio/other-resources/udunits2/) units (singular). |
| reference_system | string\|number\|object | The spatial reference system for the data, specified as [numerical EPSG code](http://www.epsg-registry.org/), [WKT2 (ISO 19162) string](http://docs.opengeospatial.org/is/18-010r7/18-010r7.html) or [PROJJSON object](https://proj.org/specifications/projjson.html). Defaults to EPSG code 4326. |
A Vertical Spatial Dimension Object MUST specify an `extent` or `values`. It MAY specify both.
### Temporal Dimension Object
A temporal dimension based on the ISO 8601 standard. The temporal reference system for the data is expected to be ISO 8601 compliant
(Gregorian calendar / UTC). Data not compliant with ISO 8601 can be represented as an *Additional Dimension Object* with `type` set to `temporal`.
| Field Name | Type | Description |
| ---------- | --------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `temporal`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[string\|null] | **REQUIRED.** Extent (lower and upper bounds) of the dimension as two-element array. The dates and/or times must be strings compliant to [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). `null` is allowed for open date ranges. |
| values | \[string] | If the dimension consists of an ordered list of specific values they can be listed here. The dates and/or times must be strings compliant to [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). |
| step | string\|null | The space between the temporal instances as [ISO 8601 duration](https://en.wikipedia.org/wiki/ISO_8601#Durations), e.g. `P1D`. Use `null` for irregularly spaced steps. |
### Spatial Vector Dimension Object
A vector dimension that defines a spatial dimension based on geometries.
| Field Name | Type | Description |
| ---------------- | -------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `geometry`. |
| axes | \[string] | Axes of the vector dimension as an ordered set of `x`, `y` and `z`. Defaults to `x` and `y`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| bbox | \[number] | **REQUIRED.** A single bounding box of the geometries as defined for [STAC Collections](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#spatial-extent-object), but not nested. |
| values | \[string\] | Optionally, a representation of the geometries. This could be a list of WKT strings or other identifiers. |
| geometry_types | \[[GeoJSON Types](https://www.rfc-editor.org/rfc/rfc7946#section-1.4)] | A set of geometry types. If not present, mixed geometry types must be assumed. |
| reference_system | string\|number\|object | The spatial reference system for the data, specified as [numerical EPSG code](http://www.epsg-registry.org/), [WKT2 (ISO 19162) string](http://docs.opengeospatial.org/is/18-010r7/18-010r7.html) or [PROJJSON object](https://proj.org/specifications/projjson.html). Defaults to EPSG code 4326. |
For a general explanation what a vector datacube and a vector dimension is, please read the article "[Vector Data Cubes](https://r-spatial.org/r/2022/09/12/vdc.html)".
### Additional Dimension Object
An additional dimension that is not `spatial`, but may be `temporal` if the data is not compliant with ISO 8601 (see below).
| Field Name | Type | Description |
| ---------------- | ----------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Custom type of the dimension, never `spatial` or `geometry`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[number\|null] | If the dimension consists of [ordinal](https://en.wikipedia.org/wiki/Level_of_measurement#Ordinal_scale) values, the extent (lower and upper bounds) of the values as two-element array. Use `null` for open intervals. |
| values | \[number\|string] | An ordered list of all values, especially useful for [nominal](https://en.wikipedia.org/wiki/Level_of_measurement#Nominal_level) values. |
| step | number\|null | If the dimension consists of [interval](https://en.wikipedia.org/wiki/Level_of_measurement#Interval_scale) values, the space between the values. Use `null` for irregularly spaced steps. |
| unit | string | The unit of measurement for the data, preferably compliant to [UDUNITS-2](https://ncics.org/portfolio/other-resources/udunits2/) units (singular). |
| reference_system | string | The reference system for the data. |
An Additional Dimension Object MUST specify an `extent` or `values`. It MAY specify both.
Note on "Additional Dimension" with type `temporal`:
You can distinguish the "Temporal Dimension" from an "Additional Dimension" by checking whether the extent exists and contains strings.
So if the `type` equals `temporal` and `extent` is an array of strings/null, then you have a "Temporal Dimension",
otherwise you have an "Additional Dimension".
#### variables
Todo

View File

@ -0,0 +1,12 @@
from pathlib import Path
import orjson as json
from tree_traverser.DataCubeTree import CompressedTree
data_path = Path("./config/climate-dt/new_format.json")
with data_path.open("r") as f:
compressed_tree = CompressedTree.from_json(json.loads(f.read()))
compressed_tree = compressed_tree.guess_datatypes()
compressed_tree.print(depth=10)

View File

@ -3,13 +3,15 @@ from pathlib import Path
from tree_traverser import CompressedTree
data_path = Path("data/compressed_tree_climate_dt_ecmwf_style.json")
data_path = Path("./config/climate-dt/compressed_tree.json")
# Print size of file
print(f"climate dt compressed tree: {data_path.stat().st_size // 1e6:.1f} MB")
print("Opening json file")
compressed_tree = CompressedTree.load(data_path)
print(compressed_tree.to_json())
print("Outputting compressed tree ecmwf style")
with open("data/compressed_tree_climate_dt_ecmwf_style.json", "w") as f:
json.dump(compressed_tree.reconstruct_compressed_ecmwf_style(), f)
with open("config/climate-dt/new_format.json", "w") as f:
json.dump(compressed_tree.to_json(), f)

View File

@ -5,27 +5,35 @@ from tree_traverser import CompressedTree, RefcountedDict
class CompressedTreeFixed(CompressedTree):
@classmethod
def from_json(cls, data : dict):
def from_json(cls, data: dict):
c = cls({})
c.cache = {}
ca = data["cache"]
for k, v in ca.items():
g = {k2 : ca[str(v2)]["dict"][k2] if k2 in ca[str(v2)]["dict"] else v2 for k2, v2 in v["dict"].items()}
g = {
k2: ca[str(v2)]["dict"][k2] if k2 in ca[str(v2)]["dict"] else v2
for k2, v2 in v["dict"].items()
}
c.cache[int(k)] = RefcountedDict(g)
c.cache[int(k)].refcount = v["refcount"]
c.root_hash = data["root_hash"]
c.tree = c.cache[c.root_hash]
return c
def reconstruct(self, max_depth=None) -> dict[str, dict]:
"Reconstruct the tree as a normal nested dictionary"
def reconstruct_node(h : int, depth : int) -> dict[str, dict]:
def reconstruct_node(h: int, depth: int) -> dict[str, dict]:
if max_depth is not None and depth > max_depth:
return {}
return {k : reconstruct_node(v, depth=depth+1) for k, v in self.cache[h].items()}
return reconstruct_node(self.root_hash, depth = 0)
return {
k: reconstruct_node(v, depth=depth + 1)
for k, v in self.cache[h].items()
}
return reconstruct_node(self.root_hash, depth=0)
data_path = Path("data/compressed_tree_climate_dt.json")
# Print size of file
@ -39,5 +47,6 @@ output_data_path = Path("data/compressed_tree_climate_dt_ecmwf_style.json")
compressed_tree.save(output_data_path)
print(f"climate dt compressed tree ecmwf style: {output_data_path.stat().st_size // 1e6:.1f} MB")
print(
f"climate dt compressed tree ecmwf style: {output_data_path.stat().st_size // 1e6:.1f} MB"
)

test_scripts/rust.py Normal file
View File

@ -0,0 +1,81 @@
from __future__ import annotations
from datetime import datetime
from typing import Sequence
from qubed.rust import Qube as rsQube
# q = pyQube.from_tree("""
# root, class=d1
# ├── dataset=another-value, generation=1/2/3
# └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """)
# json_str = json.dumps(q.to_json())
# rust_qube = Qube.from_json(json_str)
# # print(repr(rust_qube))
# # print(json_str)
# expected = """root, class=d1
# ├── dataset=another-value, generation=1/2/3
# └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """
# assert repr(rust_qube) == expected
# # print(rs_qube._repr_html_())
# print(q | q)
value = str | int | float | datetime
class Qube(rsQube):
@classmethod
def empty(cls):
q = cls()
print(f"empty called {cls = } {q = }")
return q
@classmethod
def from_datacube(cls, datacube: dict[str, value | Sequence[value]]) -> Qube:
qube = cls.empty()
(key, values), *key_vals = list(datacube.items())
node = qube.add_node(qube.root, key, values)
for key, values in key_vals:
node = qube.add_node(parent=node, key=key, values=values)
return qube
@classmethod
def from_dict(cls, d: dict) -> Qube:
q = cls.empty()
def from_dict(parent, d: dict):
for k, children in d.items():
key, values = k.split("=")
values = values.split("/")
node = q.add_node(
parent=parent,
key=key,
values=values,
)
from_dict(parent=node, d=children)
from_dict(q.root, d)
return q
q = Qube.from_datacube({"a": ["4"], "b": "test", "c": ["1", "2", "3"]})
print(q)
print(repr(q))
q = Qube.from_dict(
{
"a=2/3": {"b=1": {}},
"a2=a/b": {"b2=1/2": {}},
}
)
print(q)
print(repr(q))

View File

@ -5,15 +5,15 @@ from tqdm import tqdm
from pathlib import Path
import json
from more_itertools import chunked
process = psutil.Process()
def massage_request(r):
return {k : v if isinstance(v, list) else [v]
for k, v in r.items()}
return {k: v if isinstance(v, list) else [v] for k, v in r.items()}
if __name__ == "__main__":
config = """
---
type: remote
@ -24,18 +24,18 @@ store: remote
"""
request = {
"class": "d1",
"dataset": "climate-dt",
# "date": "19920420",
}
"class": "d1",
"dataset": "climate-dt",
# "date": "19920420",
}
data_path = Path("data/compressed_tree_climate_dt.json")
if not data_path.exists():
compressed_tree = CompressedTree({})
else:
compressed_tree = CompressedTree.load(data_path)
fdb = backend.PyFDB(fdb_config = config)
fdb = backend.PyFDB(fdb_config=config)
visited_path = Path("data/visited_dates.json")
if not visited_path.exists():
@ -46,22 +46,24 @@ store: remote
today = datetime.datetime.today()
start = datetime.datetime.strptime("19920420", "%Y%m%d")
date_list = [start + datetime.timedelta(days=x) for x in range((today - start).days)]
date_list = [d.strftime("%Y%m%d") for d in date_list if d not in visited_dates]
date_list = [
start + datetime.timedelta(days=x) for x in range((today - start).days)
]
date_list = [d.strftime("%Y%m%d") for d in date_list if d not in visited_dates]
for dates in chunked(tqdm(date_list), 5):
print(dates[0])
print(f"Memory usage: {(process.memory_info().rss)/1e6:.1f} MB")
print(f"Memory usage: {(process.memory_info().rss) / 1e6:.1f} MB")
r = request | dict(date = dates)
r = request | dict(date=dates)
tree = fdb.traverse_fdb(massage_request(r))
compressed_tree.insert_tree(tree)
compressed_tree.save(data_path)
for date in dates:
visited_dates.add(date)
with open(visited_path, "w") as f:
json.dump(list(visited_dates), f)
# print(compressed_tree.reconstruct_compressed_ecmwf_style())

View File

@ -0,0 +1,99 @@
# Example script for ingesting data from an fdb into a qube
# Notes
# Uses fdb --compact
# Splits by date in order to avoid out of memory problems with fdb --compact
# Does a bit of processing like removing "year" and "month" keys
# Might want to add datatypes and reordering of keys there too
import json
import subprocess
from datetime import datetime, timedelta
from time import time
import psutil
from qubed import Qube
from tqdm import tqdm
import requests
process = psutil.Process()
CHUNK_SIZE = timedelta(days=60)
FILEPATH = "tests/example_qubes/full_dt.json"
API = "https://qubed.lumi.apps.dte.destination-earth.eu/api/v1"
with open("config/api.secret", "r") as f:
secret = f.read()
def ecmwf_date(d):
return d.strftime("%Y%m%d")
start_date = datetime.now() - timedelta(days=120)
# start_date = datetime(1990, 1, 1)
# end_date = datetime.now()
end_date = datetime(2026, 1, 1)
current_span = [end_date - CHUNK_SIZE, end_date]
try:
qube = Qube.load(FILEPATH)
except:
qube = Qube.empty()
while current_span[0] > start_date:
for config in ["config/config-climate-dt.yaml", "config/config-extremes-dt.yaml"]:
t0 = time()
start, end = map(ecmwf_date, current_span)
print(f"Doing {config} {current_span[0].date()} - {current_span[1].date()}")
print(f"Current memory usage: {process.memory_info().rss / 1e9:.2g}GB")
print(f"{qube.n_nodes = }, {qube.n_leaves = },")
subqube = Qube.empty()
command = [
f"fdb list --compact --config {config} --minimum-keys=date class=d1,date={start}/{end}"
]
try:
p = subprocess.run(
command,
text=True,
shell=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
check=True,
)
except Exception as e:
print(f"Failed for {current_span} {e}")
continue
print("Got compact list")
for i, line in tqdm(enumerate(list(p.stdout.split("\n")))):
if not line.startswith("retrieve,class="):
continue
def split(t):
return t[0], t[1].split("/")
# Could do datatypes here
request = dict(split(v.split("=")) for v in line.strip().split(",")[1:])
request.pop("year", None)
request.pop("month", None)
# Could do things like date = year + month + day
q = Qube.from_datacube(request)
subqube = subqube | q
print("added to qube")
qube = qube | subqube
subqube.print(depth=2)
print(f"{subqube.n_nodes = }, {subqube.n_leaves = },")
requests.post(
API + "/union/climate-dt/",
headers = {"Authorization" : f"Bearer {secret}"},
json = subqube.to_json())
current_span = [current_span[0] - CHUNK_SIZE, current_span[0]]
print(
f"Did that taking {(time() - t0) / CHUNK_SIZE.days:2g} seconds per day ingested, total {(time() - t0):2g}s"
)
with open(FILEPATH, "w") as f:
json.dump(qube.to_json(), f)

Binary file not shown.

Binary file not shown.

tests/data/mars_list.gz Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,160 @@
from qubed import Qube
q = Qube.from_tree("""
root
class=od
expver=0001
param=1
param=2
expver=0002
param=1
param=2
class=rd
expver=0001
param=1
param=2
param=3
expver=0002
param=1
param=2
""")
def test_getitem():
assert q["class", "od"] == Qube.from_tree("""
root
expver=0001
param=1
param=2
expver=0002
param=1
param=2
""")
assert q["class", "od"]["expver", "0001"] == Qube.from_tree("""
root
param=1
param=2""")
def test_n_leaves():
q = Qube.from_dict(
{"a=1/2/3": {"b=1/2/3": {"c=1/2/3": {}}}, "a=5": {"b=4": {"c=4": {}}}}
)
# Size is 3*3*3 + 1*1*1 = 27 + 1
assert q.n_leaves == 27 + 1
def test_n_leaves_empty():
assert Qube.empty().n_leaves == 0
def test_n_nodes_empty():
assert Qube.empty().n_nodes == 0
def test_union():
q = Qube.from_dict(
{
"a=1/2/3": {"b=1": {}},
}
)
r = Qube.from_dict(
{
"a=2/3/4": {"b=2": {}},
}
)
u = Qube.from_dict(
{
"a=4": {"b=2": {}},
"a=1": {"b=1": {}},
"a=2/3": {"b=1/2": {}},
}
)
assert q | r == u
def test_union_with_empty():
q = Qube.from_dict(
{
"a=1/2/3": {"b=1": {}},
}
)
assert q | Qube.empty() == q
def test_union_2():
q = Qube.from_datacube(
{
"class": "d1",
"dataset": ["climate-dt", "another-value"],
"generation": ["1", "2", "3"],
}
)
r = Qube.from_datacube(
{
"class": "d1",
"dataset": ["weather-dt", "climate-dt"],
"generation": ["1", "2", "3", "4"],
}
)
u = Qube.from_dict(
{
"class=d1": {
"dataset=climate-dt/weather-dt": {
"generation=1/2/3/4": {},
},
"dataset=another-value": {
"generation=1/2/3": {},
},
}
}
)
assert q | r == u
def test_difference():
q = Qube.from_dict(
{
"a=1/2/3/5": {"b=1": {}},
}
)
r = Qube.from_dict(
{
"a=2/3/4": {"b=1": {}},
}
)
i = Qube.from_dict(
{
"a=1/5": {"b=1": {}},
}
)
assert q - r == i
def test_order_independence():
u = Qube.from_dict(
{
"a=4": {"b=2": {}},
"a=1": {"b=2": {}, "b=1": {}},
"a=2/3": {"b=1/2": {}},
}
)
v = Qube.from_dict(
{
"a=2/3": {"b=1/2": {}},
"a=4": {"b=2": {}},
"a=1": {"b=1": {}, "b=2": {}},
}
)
assert u == v

tests/test_compression.py Normal file
View File

@ -0,0 +1,76 @@
from qubed import Qube
def test_smoke():
q = Qube.from_dict(
{
"class=od": {
"expver=0001": {"param=1": {}, "param=2": {}},
"expver=0002": {"param=1": {}, "param=2": {}},
},
"class=rd": {
"expver=0001": {"param=1": {}, "param=2": {}, "param=3": {}},
"expver=0002": {"param=1": {}, "param=2": {}},
},
}
)
ct = Qube.from_tree("""
root
class=od, expver=0001/0002, param=1/2
class=rd
expver=0001, param=1/2/3
expver=0002, param=1/2
""")
assert q.compress() == ct
def test_2():
qube = Qube.from_dict(
{
"class=d1": {
"generation=1": {
"date=20240728": {"time=0600": {"param=8/78/79": {}}},
"date=20240828": {"time=0600": {"param=8/78/79": {}}},
"date=20240928": {"time=0600": {"param=8/78/79": {}}},
}
}
}
)
target = Qube.from_datacube(
{
"class": "d1",
"generation": "1",
"date": ["20240728", "20240828", "20240928"],
"time": "0600",
"param": ["8", "78", "79"],
}
)
assert qube.compress() == target
def test_removal_compression():
qube = Qube.from_dict(
{
"class=d1": {
"generation=1": {
"month=07": {"date=20240728": {"time=0600": {"param=8/78/79": {}}}},
"month=08": {"date=20240828": {"time=0600": {"param=8/78/79": {}}}},
"month=09": {"date=20240928": {"time=0600": {"param=8/78/79": {}}}},
}
}
}
)
target = Qube.from_datacube(
{
"class": "d1",
"generation": "1",
"date": ["20240728", "20240828", "20240928"],
"time": "0600",
"param": ["8", "78", "79"],
}
)
assert qube.remove_by_key(["month"]) == target

Some files were not shown because too many files have changed in this diff.