diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/background.md b/docs/background.md new file mode 100644 index 0000000..dd1cdab --- /dev/null +++ b/docs/background.md @@ -0,0 +1,113 @@ +# WIP +# Datacubes, Trees and Compressed trees + +This first part is essentially an abridged version of the [datacube spec](https://github.com/ecmwf/datacube-spec), see that document for more detail and the canonical source of truth on the matter. + +Qubed is primarily geared towards dealing with datafiles uniquely labeled by sets of key value pairs. We'll call a set of key value pairs that uniquely labels some data an `identifier`. Here's an example: + +```python +{'class': 'd1', + 'dataset': 'climate-dt', + 'generation': '1', + 'date': '20241102', + 'resolution': 'high', + 'time': '0000', +} +``` + +Unfortunately, we have more than one data file. 
If we are lucky, the set of identifiers that currently exist might form a dense datacube that we could represent like this: + +```python +{'class': ['d1', 'd2'], + 'dataset': 'climate-dt', + 'generation': ['1','2','3'], + 'model': 'icon', + 'date': ['20241102','20241103'], + 'resolution': ['high','low'], + 'time': ['0000', '0600', '1200', '1800'], +} +``` + +with the property that any particular choice for a value for any key will correspond to a datafile that exists. + +To save space I will also represent this same thing like this: +``` +- class=d1/d2, dataset=climate-dt, generation=1/2/3, model=icon, date=20241102/20241103, resolution=high/low, time=0000/0600/1200/1800 +``` + +Unfortunately, we are not lucky and our datacubes are not always dense. In this case we might instead represent which data exists using a tree: +``` +root +├── class=od +│ ├── expver=0001 +│ │ ├── param=1 +│ │ └── param=2 +│ └── expver=0002 +│ ├── param=1 +│ └── param=2 +└── class=rd + ├── expver=0001 + │ ├── param=1 + │ ├── param=2 + │ └── param=3 + └── expver=0002 + ├── param=1 + └── param=2 +``` + +But it's clear that the above tree contains a lot of redundant information. Many of the subtrees are identical, for example. Indeed in practice a lot of our data turns out to be 'nearly dense' in that it contains many dense datacubes within it. + +There are many valid ways one could compress this tree. If we add the restriction that no identical key=value pairs can be adjacent then here is the compressed tree we might get: + +``` +root +├── class=rd +│ ├── expver=0001, param=1/2/3 +│ └── expver=0002, param=1/2 +└── class=od, expver=0001/0002, param=1/2 +``` + +Without the above restriction we could instead have: + +``` +root +├── class=rd +│ ├── expver=0001, param=3 +│ └── expver=0001/0002, param=1/2 +└── class=od, expver=0001/0002, param=1/2 +``` + +but we do not allow this because it would mean we would have to take multiple branches in order to find data with `expver=0001`. 
+ +What we have now is a tree of dense datacubes which represents a single larger sparse datacube in a more compact manner. For want of a better word we'll call it a Qube. + +## API + +Qubed will provide a core compressed tree data structure called a Qube with: + +Methods to convert to and from: + - [x] A human readable representation like those seen above. + - [x] An HTML version where subtrees can be collapsed. + - [ ] A compact protobuf-based binary format + - [x] Nested python dictionaries or JSON + - [/] The output of [fdb list](https://confluence.ecmwf.int/display/FDB/fdb-list) + - [ ] [mars list][mars list] + - [ ] [constraints.json][constraints] + +[constraints]: https://object-store.os-api.cci2.ecmwf.int/cci2-prod-catalogue/resources/reanalysis-era5-land/constraints_a0ae5b42d67869674e13fba9fd055640bcffc37c24578be1f465d7d5ab2c7ee5.json +[mars list]: https://git.ecmwf.int/projects/CDS/repos/cads-forms-reanalysis/browse/reanalysis-era5-single-levels/gecko-config/mars.list?at=refs%2Fheads%2Fprod + +Useful algorithms: + - [x] Compression + - [/] Union/Intersection/Difference + +Performant Membership Queries + - Identifier membership + - Datacube query (selection) + +Metadata Storage + + + + + diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..c2278ca --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,37 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'qubed' +copyright = '2025, Tom Hodson (ECMWF)' +author = 'Tom Hodson (ECMWF)' +release = '0.1.0' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.autodoc", # for generating documentation from the docstrings in our code + "sphinx.ext.napoleon", # for parsing Numpy and Google style docstrings + "myst_parser", # For parsing markdown +] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +source_suffix = { + ".rst": "restructuredtext", + ".txt": "markdown", + ".md": "markdown", +} + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..afd9a7c --- /dev/null +++ b/docs/index.md @@ -0,0 +1,114 @@ +## Qubed + +# Datacubes, Trees and Compressed trees + +This first part is essentially an abridged version of the [datacube spec](https://github.com/ecmwf/datacube-spec), see that document for more detail and the canonical source of truth on the matter. + +Qubed is primarily geared towards dealing with datafiles uniquely labeled by sets of key value pairs. We'll call a set of key value pairs that uniquely labels some data an `identifier`. 
Here's an example: + +```python +{'class': 'd1', + 'dataset': 'climate-dt', + 'generation': '1', + 'date': '20241102', + 'resolution': 'high', + 'time': '0000', +} +``` + +Unfortunately, we have more than one data file. If we are lucky, the set of identifiers that currently exist might form a dense datacube that we could represent like this: + +```python +{'class': ['d1', 'd2'], + 'dataset': 'climate-dt', + 'generation': ['1','2','3'], + 'model': 'icon', + 'date': ['20241102','20241103'], + 'resolution': ['high','low'], + 'time': ['0000', '0600', '1200', '1800'], +} +``` + +with the property that any particular choice for a value for any key will correspond to a datafile that exists. + +To save space I will also represent this same thing like this: +``` +- class=d1/d2, dataset=climate-dt, generation=1/2/3, model=icon, date=20241102/20241103, resolution=high/low, time=0000/0600/1200/1800 +``` + +Unfortunately, we are not lucky and our datacubes are not always dense. In this case we might instead represent which data exists using a tree: +``` +root +├── class=od +│ ├── expver=0001 +│ │ ├── param=1 +│ │ └── param=2 +│ └── expver=0002 +│ ├── param=1 +│ └── param=2 +└── class=rd + ├── expver=0001 + │ ├── param=1 + │ ├── param=2 + │ └── param=3 + └── expver=0002 + ├── param=1 + └── param=2 +``` + +But it's clear that the above tree contains a lot of redundant information. Many of the subtrees are identical, for example. Indeed in practice a lot of our data turns out to be 'nearly dense' in that it contains many dense datacubes within it. + +There are many valid ways one could compress this tree. 
If we add the restriction that no identical key=value pairs can be adjacent then here is the compressed tree we might get: + +``` +root +├── class=rd +│ ├── expver=0001, param=1/2/3 +│ └── expver=0002, param=1/2 +└── class=od, expver=0001/0002, param=1/2 +``` + +Without the above restriction we could instead have: + +``` +root +├── class=rd +│ ├── expver=0001, param=3 +│ └── expver=0001/0002, param=1/2 +└── class=od, expver=0001/0002, param=1/2 +``` + +but we do not allow this because it would mean we would have to take multiple branches in order to find data with `expver=0001`. + +What we have now is a tree of dense datacubes which represents a single larger sparse datacube in a more compact manner. For want of a better word we'll call it a Qube. + +## API + +Qubed will provide a core compressed tree data structure called a Qube with: + +Methods to convert to and from: + - [x] A human readable representation like those seen above. + - [x] An HTML version where subtrees can be collapsed. 
+ - [ ] A compact protobuf-based binary format + - [x] Nested python dictionaries or JSON + - [/] The output of [fdb list](https://confluence.ecmwf.int/display/FDB/fdb-list) + - [ ] [mars list][mars list] + - [ ] [constraints.json][constraints] + +[constraints]: https://object-store.os-api.cci2.ecmwf.int/cci2-prod-catalogue/resources/reanalysis-era5-land/constraints_a0ae5b42d67869674e13fba9fd055640bcffc37c24578be1f465d7d5ab2c7ee5.json +[mars list]: https://git.ecmwf.int/projects/CDS/repos/cads-forms-reanalysis/browse/reanalysis-era5-single-levels/gecko-config/mars.list?at=refs%2Fheads%2Fprod + +Useful algorithms: + - [x] Compression + - [/] Union/Intersection/Difference + +Performant Membership Queries + - Identifier membership + - Datacube query (selection) + +Metadata Storage + + + + + diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd