Compare commits

..

1 Commits

Author SHA1 Message Date
Tom
4a97eea317 Initial code 2025-03-28 09:28:48 +00:00
96 changed files with 4863 additions and 22444 deletions

4
.gitignore vendored
View File

@ -17,7 +17,3 @@ dist/
Cargo.lock
src/python/qubed/_version.py
*.ipynb
cmake_build/
tests/data/
*.secret
node_modules/

View File

@ -7,8 +7,7 @@ repos:
- id: trailing-whitespace
- id: end-of-file-fixer
# - id: check-yaml
# - id: check-added-large-files
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.7
hooks:

View File

@ -8,9 +8,14 @@ repository = "https://github.com/ecmwf/qubed"
# rsfdb = {git = "https://github.com/ecmwf/rsfdb", branch = "develop"}
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
pyo3 = "0.25"
lasso = "0.7.3"
itertools = "0.14.0"
# For fdb binding
libc = "0.2"
libloading = "0.6"
once_cell = "1.8"
[dependencies.pyo3]
version = "0.23"
[package.metadata.maturin]
version-from-git = true
@ -20,8 +25,6 @@ name = "tree_traverser"
crate-type = ["cdylib"]
path = "./src/rust/lib.rs"
# [patch.'https://github.com/ecmwf/rsfdb']
# rsfdb = { path = "../rsfdb" }
# [patch.'https://github.com/ecmwf-projects/rsfindlibs']
# rsfindlibs = { path = "../rsfindlibs" }
[features]
extension-module = ["pyo3/extension-module"]
default = ["extension-module"]

View File

@ -1,4 +1,4 @@
# <p align="center"><img src="https://raw.githubusercontent.com/ecmwf/qubed/refs/heads/main/docs/_static/banner.svg" width="1000"></p>
# Q<sup>3</sup> Quick Querying of Qubes
[![Static Badge](https://github.com/ecmwf/codex/raw/refs/heads/main/Project%20Maturity/emerging_badge.svg)](https://github.com/ecmwf/codex/raw/refs/heads/main/Project%20Maturity#emerging)
[![Docs](https://readthedocs.org/projects/qubed/badge/?version=latest)](https://qubed.readthedocs.io/en/latest/)
[![PyPi](https://img.shields.io/pypi/v/qubed.svg)](https://pypi.org/project/qubed/)
@ -46,7 +46,7 @@ In addition to this core datastructure, this repostitory contains a collection o
- 🌟 Implements our proposed [Datacube STAC Extension](./structured_stac.md).
- 🛠️ Allows efficient traversal of ECMWF's datacubes.
- Part of the implementation of this is [🌲 Tree Compressor](./tree_compresser), a **compressed tree representation** optimised for storing trees with many duplicated subtrees.
- 🔗 **[Live Example](https://qubed.lumi.apps.dte.destination-earth.eu/api/v1/stac/climate-dt/?class=od%2Cd1&dataset=climate-dt)**.
- 🔗 **[Live Example](https://climate-catalogue.lumi.apps.dte.destination-earth.eu/api/stac?root=root&activity=story-nudging%2Cscenariomip&class=d1)**.
---
@ -54,7 +54,7 @@ In addition to this core datastructure, this repostitory contains a collection o
> **Web Frontend**
- 👀 Displays data from the **STAC Server** in an intuitive user interface.
- 🌍 **[Try the Live Demo](https://qubed.lumi.apps.dte.destination-earth.eu/)**.
- 🌍 **[Try the Live Demo](https://climate-catalogue.lumi.apps.dte.destination-earth.eu/)**.
---

View File

@ -1,5 +1,5 @@
apiVersion: v2
name: qubed
name: stac-server
description: A Helm chart for the STAC Server with frontend, STAC API and caching service.
type: application
version: 0.1.0

View File

@ -10,7 +10,7 @@ spec:
http:
paths:
{{- if .Values.stacServer.enabled }}
- path: /
- path: /api
pathType: Prefix
backend:
service:
@ -18,6 +18,15 @@ spec:
port:
number: {{ .Values.stacServer.servicePort }}
{{- end }}
{{- if .Values.webQueryBuilder.enabled }}
- path: /
pathType: Prefix
backend:
service:
name: web-query-builder
port:
number: {{ .Values.webQueryBuilder.servicePort }}
{{- end }}
tls:
- hosts:
- {{ .Values.ingress.hostname }}

View File

@ -0,0 +1,28 @@
# templates/redis-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: redis
spec:
selector:
matchLabels:
app: redis
template:
metadata:
labels:
app: redis
spec:
containers:
- name: redis
image: "redis:alpine"
command: ["redis-server", "--appendonly", "yes"]
ports:
- containerPort: {{ .Values.redis.servicePort }}
# volumeMounts:
# - mountPath: /data
# name: redis-data
# volumes:
# - name: redis-data
# persistentVolumeClaim:
# claimName: redis-data

View File

@ -0,0 +1,14 @@
# templates/redis-pvc.yaml
{{- if .Values.redis.pvc.enabled }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: redis-data
spec:
accessModes: {{ .Values.redis.pvc.accessModes }}
resources:
requests:
storage: {{ .Values.redis.pvc.size }}
storageClassName: {{ .Values.redis.pvc.storageClassName | quote }}
{{- end }}

View File

@ -0,0 +1,11 @@
apiVersion: v1
kind: Service
metadata:
name: redis
spec:
selector:
app: redis
ports:
- protocol: TCP
port: {{ .Values.redis.servicePort }}
targetPort: {{ .Values.redis.servicePort }}

View File

@ -0,0 +1,11 @@
# apiVersion: v1
# kind: ConfigMap
# metadata:
# name: stack-server
# data:
# file1.txt: |-
# {{ .Files.Get "files/file1.txt" | nindent 2 }}
# file2.txt: |-
# {{ .Files.Get "files/file2.txt" | nindent 2 }}
# file3.txt: |-
# {{ .Files.Get "files/file3.txt" | nindent 2 }}

View File

@ -5,7 +5,7 @@ kind: Deployment
metadata:
name: stac-server
spec:
replicas: {{ .Values.stacServer.replicas }}
replicas: 1 # Adjust as needed
selector:
matchLabels:
app: stac-server
@ -14,30 +14,20 @@ spec:
labels:
app: stac-server
spec:
initContainers:
- name: wait-for-redis
image: busybox
command:
[
'sh', '-c',
'until nc -z -v -w30 {{ .Values.stacServer.environment.REDIS_HOST }} {{ .Values.redis.service.port }}; do echo "Waiting for Redis..."; sleep 5; done;'
]
containers:
- name: stac-server
image: "{{ .Values.stacServer.image.repository }}:{{ .Values.stacServer.image.tag }}"
imagePullPolicy: {{ .Values.stacServer.image.pullPolicy }}
env:
- name: API_KEY
valueFrom:
secretKeyRef:
name: api-key
key: API_KEY
- name: API_URL
value: "https://{{ .Values.ingress.hostname }}/api/v1/"
- name: REDIS_HOST
value: "{{ .Values.stacServer.environment.REDIS_HOST }}"
ports:
- containerPort: {{ .Values.stacServer.servicePort }}
---
apiVersion: v1
kind: Service
metadata:
name: stac-server
spec:
selector:
app: stac-server
ports:
- protocol: TCP
port: {{ .Values.stacServer.servicePort }}
targetPort: {{ .Values.stacServer.servicePort }}
type: ClusterIP

View File

@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: stac-server
spec:
selector:
app: stac-server
ports:
- protocol: TCP
port: {{ .Values.stacServer.servicePort }}
targetPort: {{ .Values.stacServer.servicePort }}
type: ClusterIP

View File

@ -0,0 +1,37 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: web-query-builder
spec:
replicas: {{ .Values.webQueryBuilder.replicas }}
selector:
matchLabels:
app: web-query-builder
template:
metadata:
labels:
app: web-query-builder
spec:
containers:
- name: web-query-builder
image: "{{ .Values.webQueryBuilder.image.repository }}:{{ .Values.webQueryBuilder.image.tag }}"
imagePullPolicy: {{ .Values.webQueryBuilder.image.pullPolicy }}
env:
- name: API_HOST
value: stac-server
ports:
- containerPort: {{ .Values.webQueryBuilder.servicePort }}
---
apiVersion: v1
kind: Service
metadata:
name: web-query-builder
spec:
selector:
app: web-query-builder
ports:
- protocol: TCP
port: {{ .Values.webQueryBuilder.servicePort }}
targetPort: {{ .Values.webQueryBuilder.servicePort }}
type: ClusterIP

View File

@ -1,13 +1,33 @@
redis:
servicePort: 6379
pvc:
enabled: true
storageClassName: ""
accessModes:
- ReadWriteOnce
size: 1Gi
service:
port: 6379
stacServer:
enabled: true
replicas: 1
image:
repository: "eccr.ecmwf.int/qubed/stac_server"
tag: "latest"
pullPolicy: Always
servicePort: 80
environment:
REDIS_HOST: "redis"
webQueryBuilder:
enabled: true
image:
repository: "eccr.ecmwf.int/qubed/web_query_builder"
tag: "latest"
pullPolicy: Always
servicePort: 80
ingress:
enabled: True
tlsSecretName: "lumi-wildcard-tls"
hostname: "qubed.lumi.apps.dte.destination-earth.eu"
hostname: "climate-catalogue.lumi.apps.dte.destination-earth.eu"

View File

@ -1,5 +1,16 @@
services:
# redis server holds the catalog data blob
redis:
image: redis:alpine
container_name: redis
command: ["redis-server", "--appendonly", "yes"]
volumes:
- ./redis-data:/data
ports:
- "6379:6379"
restart: always
# STAC Server
stac_server:
# image: stac-server:latest
@ -9,24 +20,26 @@ services:
dockerfile: Dockerfile
target: stac_server
ports:
- "8124:80"
- "8124:8080"
environment:
- REDIS_HOST=redis
- CONFIG_DIR=/config
volumes:
- ./stac_server:/code/stac_server
- ./TreeTraverser:/code/TreeTraverser
# restart: always
web_query_builder:
# image: web_query_builder:latest
container_name: web_query_builder
build:
context: .
dockerfile: Dockerfile
target: web_query_builder
ports:
- "8125:80"
environment:
- API_URL=http://127.0.0.1:8124/api/v1/stac/climate-dt
volumes:
- ./web_query_builder:/code/web_query_builder
restart: always
# web_query_builder:
# # image: web_query_builder:latest
# container_name: web_query_builder
# build:
# context: .
# dockerfile: Dockerfile
# target: web_query_builder
# ports:
# - "8125:8080"
# environment:
# - CONFIG_DIR=/config
# volumes:
# - ./web_query_builder:/code/web_query_builder
# restart: always

View File

@ -1,6 +0,0 @@
---
type: remote
host: databridge-prod-catalogue3-ope.ewctest.link
port: 10000
engine: remote
store: remote

View File

@ -1,6 +0,0 @@
---
type: remote
host: databridge-prod-catalogue1-ope.ewctest.link
port: 10000
engine: remote
store: remote

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1253
config/local/language.yaml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -24,13 +24,32 @@ FROM base AS stac_server
COPY stac_server/requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
COPY ./src /code/qubed/src
COPY ./pyproject.toml /code/qubed/
COPY ./Cargo.toml /code/qubed/
COPY ./README.md /code/qubed/
# Todo: don't embed this here, mount them at runtime
# ENV CONFIG_DIR=/config/
# COPY config/destinE/config.yaml /config/config.yaml
# COPY config/destinE/schema /config/schema
# COPY config/destinE/language.yaml /config/language.yaml
RUN pip install --no-cache-dir -e /code/qubed
COPY ./tree_compresser /code/tree_compresser
# Clone the rsfdb and rsfindlibs repos manually because they're private
# RUN --mount=type=ssh git clone ssh://git@github.com/ecmwf/rsfdb.git
# RUN --mount=type=ssh git clone ssh://git@github.com/ecmwf/rsfindlibs.git
COPY stac_server/deps/rsfdb /code/rsfdb
COPY stac_server/deps/rsfindlibs /code/rsfindlibs
RUN pip install --no-cache-dir -e /code/tree_compresser
COPY ./stac_server /code/stac_server
WORKDIR /code/stac_server
CMD ["fastapi", "dev", "main.py", "--proxy-headers", "--port", "80", "--host", "0.0.0.0"]
FROM base AS web_query_builder
COPY web_query_builder/requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
COPY web_query_builder /code/web_query_builder
WORKDIR /code/web_query_builder
CMD ["flask", "run", "--host", "0.0.0.0", "--port", "80"]

View File

@ -1,159 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="1000"
height="200"
viewBox="0 0 264.58333 52.916666"
version="1.1"
id="svg5"
xml:space="preserve"
inkscape:version="1.2.2 (b0a84865, 2022-12-01)"
sodipodi:docname="banner.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg"><sodipodi:namedview
id="namedview7"
pagecolor="#ffffff"
bordercolor="#000000"
borderopacity="0.25"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="1.4221154"
inkscape:cx="509.80392"
inkscape:cy="23.908046"
inkscape:window-width="2665"
inkscape:window-height="1000"
inkscape:window-x="96"
inkscape:window-y="35"
inkscape:window-maximized="0"
inkscape:current-layer="g330" /><defs
id="defs2"><rect
x="641.41612"
y="32.816639"
width="73.588826"
height="29.833308"
id="rect2775" /><rect
x="500.20513"
y="263.52755"
width="244.63313"
height="143.19988"
id="rect2749" /><rect
x="467.38849"
y="331.14972"
width="258.55534"
height="132.261"
id="rect2743" /><rect
x="80.859469"
y="61.833711"
width="299.65568"
height="114.15454"
id="rect242" /><rect
x="61.833711"
y="66.590151"
width="313.925"
height="114.15454"
id="rect236" /></defs><g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text234"
style="font-weight:500;font-size:20px;line-height:1.2;font-family:Futura;-inkscape-font-specification:'Futura, Medium';white-space:pre;shape-inside:url(#rect236);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text240"
style="font-weight:500;font-size:20px;line-height:1.2;font-family:Futura;-inkscape-font-specification:'Futura, Medium';white-space:pre;shape-inside:url(#rect242);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text2741"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:20px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;shape-inside:url(#rect2743);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text2747"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:20px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;shape-inside:url(#rect2749);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><text
xml:space="preserve"
transform="scale(0.26458333)"
id="text2773"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:26.6667px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;shape-inside:url(#rect2775);display:inline;fill:#000000;stroke-width:0.926667;stroke-miterlimit:4.9;stroke-dasharray:2.78, 0.926667;stroke-opacity:0.40146" /><g
id="g349"
transform="translate(-5.8208336)"><text
xml:space="preserve"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="114.48351"
y="14.228302"
id="text2763"><tspan
sodipodi:role="line"
id="tspan2761"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="14.228302">root</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="21.701376"
id="tspan2765">├── class=cd, stream=lwda/oai, param=1/2/3</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="29.17445"
id="tspan2767">├── class=od, expver=1/2, param=1/2</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="36.647522"
id="tspan2771">├── class=rd, param=1/2/3</tspan><tspan
sodipodi:role="line"
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="114.48351"
y="44.120598"
id="tspan2769">└── ...</tspan></text><g
id="g330"><text
xml:space="preserve"
style="font-weight:500;font-size:14.1111px;line-height:0;font-family:Futura;-inkscape-font-specification:'Futura, Medium';white-space:pre;inline-size:112.115;display:inline;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="5.439929"
y="17.022402"
id="text248"
transform="translate(0,-1.5875)"><tspan
x="5.439929"
y="17.022402"
id="tspan532"><tspan
style="font-size:12.3472px;line-height:1.2"
id="tspan530">Qube</tspan></tspan></text><text
xml:space="preserve"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:5.29167px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;white-space:pre;inline-size:87.6248;display:inline;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="4.209815"
y="40.519432"
id="text2755"
transform="translate(1.744648,-4.9844494)"><tspan
x="4.209815"
y="40.519432"
id="tspan534">1. A data structure for efficiently </tspan><tspan
x="4.209815"
y="46.604852"
id="tspan536">representing and querying complex </tspan><tspan
x="4.209815"
y="52.690271"
id="tspan538">tree-like datacubes.</tspan></text><text
xml:space="preserve"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:5.29167px;line-height:1.15;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;fill:#000000;stroke-width:0.245181;stroke-miterlimit:4.9;stroke-dasharray:0.735542, 0.245181;stroke-opacity:0.40146"
x="5.4673572"
y="26.586193"
id="text2759"><tspan
sodipodi:role="line"
id="tspan2757"
style="font-style:normal;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal;stroke-width:0.245181"
x="5.4673572"
y="26.586193">[kjuːb] <tspan
style="font-style:italic;font-variant:normal;font-weight:500;font-stretch:normal;font-size:6.35px;font-family:Futura;-inkscape-font-specification:'Futura, Medium Italic';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
id="tspan495">noun</tspan></tspan></text><path
style="fill:#000000;stroke:#000000;stroke-width:0.445;stroke-miterlimit:4.9;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
d="M 6.0516036,18.417924 H 92.221177"
id="path2833"
sodipodi:nodetypes="cc" /></g></g></g></svg>

Before

Width:  |  Height:  |  Size: 10 KiB

View File

@ -23,14 +23,8 @@ There's some handy test data in the `tests/data` directory. For example:
gzip -dc tests/data/fdb_list_compact.gz| qubed convert --from=fdb --to=text --output=qube.txt
gzip -dc tests/data/fdb_list_porcelain.gz| qubed convert --from=fdb --to=json --output=qube.json
gzip -dc tests/data/fdb_list_compact.gz | qubed convert --from=fdb --to=html --output=qube.html
// Operational data stream=oper/wave/enfo/waef
fdb list class=od,expver=0001,date=0,stream=oper --compact >> operational_compact.txt
operational_compact.txt | qubed convert --from=fdb --to=text --output=operational.txt
```
## Todo
--from for

View File

@ -33,7 +33,3 @@ source_suffix = {
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
myst_enable_extensions = [
"attrs_inline",
]

View File

@ -1,21 +0,0 @@
# Development
To install the latest stable release from PyPI (recommended):
```bash
pip install qubed
```
To install the latest version from github (requires rust):
```bash
pip install qubed@git+https://github.com/ecmwf/qubed.git@main
```
To build the develop branch from source install a rust toolchain and pip install maturin then run:
```
git clone -b develop git@github.com:ecmwf/qubed.git
cd qubed
maturin develop
```

View File

@ -12,42 +12,44 @@ jupytext:
```{toctree}
:maxdepth: 1
quickstart.md
development.md
background.md
algorithms.md
fiab.md
cmd.md
```
Qubed provides a datastructure primitive for working with trees of DataCubes. If a normal tree looks like this:
```
root
├── class=od
│ ├── expver=0001
│ │ ├── param=1
│ │ └── param=2
│ └── expver=0002
│ ├── param=1
│ └── param=2
└── class=rd
├── expver=0001
│ ├── param=1
│ ├── param=2
│ └── param=3
└── expver=0002
├── param=1
└── param=2
Qubed provides a datastructure called a Qube which represents sets of data identified by multiple key value pairs as a tree of datacubes. To understand what that means, go to [Background](background.md); to just start using the library, skip straight to the [Quickstart](quickstart.md).
Here's a real world dataset from the [Climate DT](https://destine.ecmwf.int/climate-change-adaptation-digital-twin-climate-dt/):
```{code-cell} python3
import requests
from qubed import Qube
climate_dt = Qube.from_json(requests.get("https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json").json())
climate_dt.html(depth=1)
```
A compressed view of the same set would be:
```
root
├── class=od, expver=0001/0002, param=1/2
└── class=rd
├── expver=0001, param=1/2/3
└── expver=0002, param=1/2
Click the arrows to expand and drill down deeper into the data. Any particular dataset is uniquely identified by a set of key value pairs:
```{code-cell} python3
import json
for i, identifier in enumerate(climate_dt.leaves()):
print(identifier)
break
```
Qubed provides a datastructure that represents this compressed cube we call a Qube. It defines all the algorithms you would expect such as intersection/union/difference, compression, search, transformation and filtering.
Here's an idea of the set of values each key can take:
```{code-cell} python3
axes = climate_dt.axes()
for key, values in axes.items():
print(f"{key} : {list(sorted(values))[:10]}")
```
To get a little more background on the motivation and structure of a Qube, go to [Background](background.md); for a more hands-on intro, go to [Quickstart](quickstart.md).
This dataset isn't dense: you can't choose just any combination of the above key value pairs, but it does contain many dense datacubes. Hence it makes sense to store and process the set as a tree of dense datacubes, which is what we call a Qube. For a sense of scale, this dataset contains about 200 million distinct datasets but only contains a few thousand unique nodes.
```{code-cell} python3
print(f"""
Distinct datasets: {climate_dt.n_leaves},
Number of nodes in the tree: {climate_dt.n_nodes}
""")
```

View File

@ -8,54 +8,32 @@ jupytext:
---
# Quickstart
First install qubed with `pip install qubed`. Now, let's dive in with a real world dataset from the [Climate DT](https://destine.ecmwf.int/climate-change-adaptation-digital-twin-climate-dt/). We'll pull a prebuilt qube from github and render it in its default HTML representation.
```{code-cell} python3
import requests
from qubed import Qube
climate_dt = Qube.from_json(requests.get("https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json").json())
climate_dt.html(depth=1)
## Installation
To install the latest stable release from PyPI (recommended):
```bash
pip install qubed
```
Or to build and install the latest version from github (requires cargo):
```bash
pip install qubed@git+https://github.com/ecmwf/qubed.git@main
```
Click the arrows to expand and drill down deeper into the data.
## Development
```{note}
There is currently a simple Qube web browser hosted [here](https://qubed.lumi.apps.dte.destination-earth.eu/). Browse that and copy the 'Example Qube Code' to download a Qube representing the selection at that point. You'll get something like `Qube.from_json(requests.get("https://qubed.lumi.apps.dte.destination-earth.eu/api/v1/select/climate-dt/?").json())`{l=python}
To build the develop branch from source install a rust toolchain and pip install maturin then run:
```
git clone -b develop git@github.com:ecmwf/qubed.git
cd qubed
maturin develop
```
Fundamentally a Qube represents a set of identifiers, each of which is a set of key value pairs. Here's one leaf in the Climate DT dataset:
```{code-cell} python3
next(climate_dt.leaves())
```
We can look at the set of values each key can take:
```{code-cell} python3
axes = climate_dt.axes()
for key, values in axes.items():
print(f"{key} : {list(sorted(values))[:10]}")
```
This dataset isn't dense: you can't choose just any combination of the above key value pairs, but it does contain many dense datacubes. Hence it makes sense to store and process the set as a tree of dense datacubes, which is what a Qube is. For a sense of scale, this dataset contains about 8 million distinct datasets but only contains a few hundred unique nodes.
```{code-cell} python3
import objsize
print(f"""
Distinct datasets: {climate_dt.n_leaves}
Number of nodes in the tree: {climate_dt.n_nodes}
Number of dense datacubes within this qube: {len(list(climate_dt.datacubes()))}
In memory size according to objsize: {objsize.get_deep_size(climate_dt) / 2**20:.0f} MB
""")
```
## Building your own Qubes
You can do it from nested dictionaries with keys in the form "{key=value}":
## Usage
Make an uncompressed qube:
```{code-cell} python3
from qubed import Qube
q1 = Qube.from_dict({
q = Qube.from_dict({
"class=od" : {
"expver=0001": {"param=1":{}, "param=2":{}},
"expver=0002": {"param=1":{}, "param=2":{}},
@ -65,63 +43,20 @@ q1 = Qube.from_dict({
"expver=0002": {"param=1":{}, "param=2":{}},
},
})
print(f"{q1.n_leaves = }, {q1.n_nodes = }")
q1
print(f"{q.n_leaves = }, {q.n_nodes = }")
q
```
If someone sends you a printed qube you can convert that back to a Qube too:
Compress it:
```{code-cell} python3
q2 = Qube.from_tree("""
root, frequency=6:00:00
├── levtype=pl, param=t, levelist=850, threshold=-2/-4/-8/2/4/8
└── levtype=sfc
├── param=10u/10v, threshold=10/15
├── param=2t, threshold=273.15
└── param=tp, threshold=0.1/1/10/100/20/25/5/50
""")
q2
```
We would not recommend trying to write this representation by hand though.
Finally, quite a flexible approach is to take the union of a series of dense datacubes:
```{code-cell} python3
q3 = Qube.from_datacube(
dict(
param="10u/10v/2d/2t/cp/msl/skt/sp/tcw/tp".split("/"),
threshold="*",
levtype="sfc",
frequency="6:00:00",
)
) | Qube.from_datacube(
dict(
param="q/t/u/v/w/z".split("/"),
threshold="*",
levtype="pl",
level="50/100/150/200/250/300/400/500/600/700/850".split("/"),
frequency="6:00:00",
)
)
q3
```
## Operations on Qubes
Going back to that first qube:
```{code-cell} python3
q1
```
We can compress it:
```{code-cell} python3
cq = q1.compress()
assert cq.n_leaves == q1.n_leaves
cq = q.compress()
assert cq.n_leaves == q.n_leaves
print(f"{cq.n_leaves = }, {cq.n_nodes = }")
cq
```
With the HTML representation you can click on the leaves to expand them. You can copy a path representation of a node to the clipboard by alt/option/⌥ clicking on it. You can then extract that node in code using `[]`:
```{code-cell} python3
@ -147,7 +82,30 @@ dq = Qube.from_datacube({
```
## Iteration
### Tree Construction
One of the quickest ways to construct non-trivial trees is to use the `Qube.from_datacube` method to construct dense trees and then use the set operations to combine or intersect them:
```{code-cell} python3
q = Qube.from_datacube({
"class": "d1",
"dataset": ["climate-dt", "another-value"],
'generation': ['1', "2", "3"],
})
r = Qube.from_datacube({
"class": "d1",
"dataset": ["weather-dt", "climate-dt"],
'generation': ['1', "2", "3", "4"],
})
q | r
```
### Iteration / Flattening
Iterate over the leaves:
@ -159,13 +117,26 @@ for i, identifier in enumerate(cq.leaves()):
break
```
Or if you can it's more efficient to iterate over the datacubes:
Iterate over the datacubes:
```{code-cell} python3
list(cq.datacubes())
cq.datacubes()
```
### A Real World Example
Load a larger example qube:
```{code-cell} python3
import requests
qube_json = requests.get("https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json").json()
climate_dt = Qube.from_json(qube_json)
# Using the html or print methods is optional but lets you specify things like the depth of the tree to display.
print(f"{climate_dt.n_leaves = }, {climate_dt.n_nodes = }")
climate_dt.html(depth=1) # Limit how much is open initially, click leave to see more.
```
## Selection
Select a subset of the tree:
```{code-cell} python3
@ -189,7 +160,7 @@ for key, values in axes.items():
```
## Set Operations
### Set Operations
The union/intersection/difference of two dense datacubes is not itself dense.
@ -224,7 +195,7 @@ Symmetric Difference:
(A ^ B).print();
```
## Transformations
### Transformations
`q.transform` takes a python function from one node to one or more nodes and uses this to build a new tree. This can be used for simple operations on the key or values but also to split or remove nodes. Note that you can't use it to merge nodes because it's only allowed to see one node at a time.
@ -232,42 +203,3 @@ Symmetric Difference:
def capitalize(node): return node.replace(key = node.key.capitalize())
climate_dt.transform(capitalize).html(depth=1)
```
## Save to disk
There is currently a very simple JSON serialisation format. More compact binary serialisations are planned.
```{code-cell} python3
json = climate_dt.to_json()
Qube.from_json(json) == climate_dt
```
## Advanced Selection
There is currently partial support for different datatypes in addition to strings. Here we can convert datatypes by key to ints and timedeltas and then use functions as filters in select.
```{code-cell} python3
from datetime import timedelta, datetime
def to_timedelta(t):
dt = datetime.strptime(t, "%H:%M:%S")
return timedelta(hours=dt.hour, minutes=dt.minute, seconds=dt.second)
q = Qube.from_tree("""
root, frequency=6:00:00
├── levtype=pl, levelist=850, threshold=-2/-4/-8/2/4/8
└── levtype=sfc
├── param=10u/10v, threshold=10/15
├── param=2t, threshold=273.15
└── param=tp, threshold=0.1/1/10/100/20/25/5/50
""").convert_dtypes({
"threshold": float,
"levelist": int,
"frequency": to_timedelta,
})
r = q.select({
"threshold": lambda t: t > 5,
"frequency": lambda dt: dt > timedelta(hours = 2),
})
r
```

View File

@ -1,3 +1,2 @@
numpy
scipy
objsize

View File

@ -19,7 +19,6 @@ dynamic = ["version"]
dependencies = [
"frozendict",
"numpy",
"protobuf",
# CLI
"rich",

View File

@ -7,3 +7,9 @@ sudo docker build \
--target=stac_server \
.
sudo docker push eccr.ecmwf.int/qubed/stac_server:latest
sudo docker build \
--tag=eccr.ecmwf.int/qubed/web_query_builder:latest \
--target=web_query_builder \
.
sudo docker push eccr.ecmwf.int/qubed/web_query_builder:latest

View File

@ -1,2 +1 @@
# helm install qubed chart -n qubed
helm upgrade qubed chart -n qubed
helm upgrade stac-server chart -n stac-server

19
scripts/load_redis.py Executable file
View File

@ -0,0 +1,19 @@
#! .venv/bin/python
# Copy the climate-dt catalogue and MARS language definition from local
# config files into redis, each stored as a JSON string.
import json

import redis
import yaml

print("Opening redis connection")
r = redis.Redis(host="redis", port=6379, db=0)

print("Loading data from local files")
with open("config/climate-dt/compressed_tree.json") as catalog_file:
    compressed_catalog = json.load(catalog_file)

with open("config/climate-dt/language.yaml") as language_file:
    language_document = yaml.safe_load(language_file)
# Only the "_field" section of the language file is stored.
mars_language = language_document["_field"]

print("Storing data in redis")
for redis_key, payload in (
    ("compressed_catalog", compressed_catalog),
    ("mars_language", mars_language),
):
    r.set(redis_key, json.dumps(payload))

View File

@ -1 +0,0 @@
kubectl -n qubed logs deployment/stac-server

View File

@ -1,2 +1,3 @@
# kubectl rollout restart deployment/redis
kubectl -n qubed rollout restart deployment/stac-server
kubectl rollout restart deployment/web-query-builder
kubectl rollout restart deployment/stac-server

3
scripts/setup.sh Normal file
View File

@ -0,0 +1,3 @@
# Create a virtual environment in ./.venv
python3 -m venv .venv
# Activate it so that the pip call below installs into .venv
source .venv/bin/activate
# pyyaml and redis are the third-party packages imported by scripts/load_redis.py
pip install pyyaml redis

View File

@ -1,180 +1,97 @@
# This causes python types to be evaluated later,
# allowing you to reference types like Qube inside the definition of the Qube class
# without having to do "Qube"
from __future__ import annotations
import dataclasses
import functools
import json
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass, field
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import Any, Iterable, Iterator, Literal, Mapping, Self, Sequence
from typing import Any, Iterable, Iterator, Literal, Sequence
import numpy as np
from frozendict import frozendict
from . import set_operations
from .metadata import from_nodes
from .protobuf.adapters import proto_to_qube, qube_to_proto
from .node_types import NodeData, RootNodeData
from .tree_formatters import (
HTML,
_display,
node_tree_to_html,
node_tree_to_string,
)
from .value_types import (
QEnum,
ValueGroup,
WildcardGroup,
values_from_json,
)
@dataclass
class AxisInfo:
    """Mutable summary of one axis: its key, value type, depths and values."""

    key: str
    type: Any
    depths: set[int]
    values: set

    def combine(self, other: Self):
        """Merge `other` into this object in place.

        The key and type are overwritten with those of `other`; the depths
        and values sets are extended with the entries of `other`.
        """
        self.key, self.type = other.key, other.type
        for mine, theirs in ((self.depths, other.depths), (self.values, other.values)):
            mine.update(theirs)

    def to_json(self):
        """Return a plain dict; the type is given by name and the sets become lists."""
        return dict(
            key=self.key,
            type=self.type.__name__,
            values=list(self.values),
            depths=list(self.depths),
        )
@dataclass(frozen=True, eq=True, order=True, unsafe_hash=True)
class QubeNamedRoot:
    """Helper node used to print a tree under a custom root name."""

    key: str
    children: tuple[Qube, ...] = ()

    def summary(self) -> str:
        """Return only the key, with no "=values" suffix."""
        label = self.key
        return label
from .value_types import QEnum, ValueGroup, WildcardGroup, values_from_json
@dataclass(frozen=False, eq=True, order=True, unsafe_hash=True)
class Qube:
key: str
values: ValueGroup
metadata: frozendict[str, np.ndarray] = field(
default_factory=lambda: frozendict({}), compare=False
)
children: tuple[Qube, ...] = ()
is_root: bool = False
is_leaf: bool = False
depth: int = field(default=0, compare=False)
shape: tuple[int, ...] = field(default=(), compare=False)
data: NodeData
children: tuple["Qube", ...]
@classmethod
def make_node(
cls,
key: str,
values: Iterable | QEnum | WildcardGroup,
children: Iterable[Qube],
metadata: Mapping[str, np.ndarray] = {},
is_root: bool = False,
is_leaf: bool | None = None,
) -> Qube:
if isinstance(values, ValueGroup):
values = values
else:
values = QEnum(values)
@property
def key(self) -> str:
return self.data.key
if not isinstance(values, WildcardGroup) and not is_root:
assert len(values) > 0, "Nodes must have at least one value"
@property
def values(self) -> ValueGroup:
return self.data.values
children = tuple(sorted(children, key=lambda n: ((n.key, n.values.min()))))
@property
def metadata(self) -> frozendict[str, Any]:
return self.data.metadata
return cls(
key,
values=values,
children=children,
metadata=frozendict(metadata),
is_root=is_root,
is_leaf=(not len(children)) if is_leaf is None else is_leaf,
def replace(self, **kwargs) -> "Qube":
data_keys = {
k: v for k, v in kwargs.items() if k in ["key", "values", "metadata"]
}
node_keys = {k: v for k, v in kwargs.items() if k == "children"}
if not data_keys and not node_keys:
return self
if not data_keys:
return dataclasses.replace(self, **node_keys)
return dataclasses.replace(
self, data=dataclasses.replace(self.data, **data_keys), **node_keys
)
@classmethod
def make_root(cls, children: Iterable[Qube], metadata={}) -> Qube:
def update_depth_shape(children, depth, shape):
for child in children:
child.depth = depth + 1
child.shape = shape + (len(child.values),)
update_depth_shape(child.children, child.depth, child.shape)
update_depth_shape(children, depth=0, shape=(1,))
return cls.make_node(
"root",
values=QEnum(("root",)),
children=children,
metadata=metadata,
is_root=True,
)
def replace(self, **kwargs) -> Qube:
return dataclasses.replace(self, **kwargs)
def summary(self) -> str:
if self.is_root:
return self.key
return f"{self.key}={self.values.summary()}" if self.key != "root" else "root"
return self.data.summary()
@classmethod
def load(cls, path: str | Path) -> Qube:
with open(path, "r") as f:
return Qube.from_json(json.load(f))
def make(cls, key: str, values: ValueGroup, children, **kwargs) -> "Qube":
return cls(
data=NodeData(key, values, metadata=kwargs.get("metadata", frozendict())),
children=tuple(sorted(children, key=lambda n: ((n.key, n.values.min())))),
)
@classmethod
def from_datacube(cls, datacube: Mapping[str, str | Sequence[str]]) -> Qube:
def root_node(cls, children: Iterable["Qube"]) -> "Qube":
return cls.make("root", QEnum(("root",)), children)
@classmethod
def from_datacube(cls, datacube: dict[str, str | Sequence[str]]) -> "Qube":
key_vals = list(datacube.items())[::-1]
children: list[Qube] = []
children: list["Qube"] = []
for key, values in key_vals:
values_group: ValueGroup
if values == "*":
values_group = WildcardGroup()
elif isinstance(values, list):
values_group = QEnum(values)
else:
values_group = QEnum([values])
values = WildcardGroup()
elif not isinstance(values, list):
values = [values]
children = [cls.make_node(key, values_group, children)]
if isinstance(values, list):
values = QEnum(values)
return cls.make_root(children)
children = [cls.make(key, values, children)]
return cls.root_node(children)
@classmethod
def from_json(cls, json: dict) -> Qube:
def from_json(json: dict, depth=0) -> Qube:
return Qube.make_node(
def from_json(cls, json: dict) -> "Qube":
def from_json(json: dict) -> Qube:
return Qube.make(
key=json["key"],
values=values_from_json(json["values"]),
metadata=frozendict(json["metadata"]) if "metadata" in json else {},
children=(from_json(c, depth + 1) for c in json["children"]),
is_root=(depth == 0),
children=(from_json(c) for c in json["children"]),
)
return from_json(json)
@classmethod
def from_nodes(cls, nodes: dict[str, dict], add_root: bool = True):
return from_nodes(cls, nodes, add_root)
def to_json(self) -> dict:
def to_json(node: Qube) -> dict:
return {
@ -187,48 +104,23 @@ class Qube:
return to_json(self)
@classmethod
def from_dict(cls, d: dict) -> Qube:
def from_dict(cls, d: dict) -> "Qube":
def from_dict(d: dict) -> Iterator[Qube]:
for k, children in d.items():
key, values = k.split("=")
values = values.split("/")
# children == {"..." : {}}
# is a special case to represent trees with leaves we don't know about
if frozendict(children) == frozendict({"...": {}}):
yield Qube.make_node(
key=key,
values=values,
children={},
is_leaf=False,
)
# Special case for Wildcard values
if values == ["*"]:
values = WildcardGroup()
else:
values = QEnum(values)
yield Qube.make_node(
yield Qube.make(
key=key,
values=values,
children=from_dict(children),
)
return Qube.make_root(list(from_dict(d)))
def to_dict(self) -> dict:
def to_dict(q: Qube) -> tuple[str, dict]:
key = f"{q.key}={','.join(str(v) for v in q.values)}"
return key, dict(to_dict(c) for c in q.children)
return to_dict(self)[1]
@classmethod
def from_protobuf(cls, msg: bytes) -> Qube:
return proto_to_qube(cls, msg)
def to_protobuf(self) -> bytes:
return qube_to_proto(self)
return Qube.root_node(list(from_dict(d)))
@classmethod
def from_tree(cls, tree_str):
@ -282,66 +174,59 @@ class Qube:
return cls.from_dict(root)
@classmethod
def empty(cls) -> Qube:
return Qube.make_root([])
def empty(cls) -> "Qube":
return Qube.root_node([])
def __str_helper__(self, depth=None, name=None) -> str:
node = self
if name is not None:
node = node.replace(key=name)
out = "".join(node_tree_to_string(node=node, depth=depth))
if out[-1] == "\n":
out = out[:-1]
return out
def __str__(self):
return self.__str_helper__()
def __repr__(self):
return f"Qube({self.__str_helper__()})"
def __str__(self, depth=None, name=None) -> str:
node = (
dataclasses.replace(
self,
data=RootNodeData(key=name, values=self.values, metadata=self.metadata),
)
if name is not None
else self
)
return "".join(node_tree_to_string(node=node, depth=depth))
def print(self, depth=None, name: str | None = None):
print(self.__str_helper__(depth=depth, name=name))
print(self.__str__(depth=depth, name=name))
def html(
self,
depth=2,
collapse=True,
name: str | None = None,
info: Callable[[Qube], str] | None = None,
) -> HTML:
node = self
if name is not None:
node = node.replace(key=name)
return HTML(
node_tree_to_html(node=node, depth=depth, collapse=collapse, info=info)
def html(self, depth=2, collapse=True, name: str | None = None) -> HTML:
node = (
dataclasses.replace(
self,
data=RootNodeData(key=name, values=self.values, metadata=self.metadata),
)
if name is not None
else self
)
return HTML(node_tree_to_html(node=node, depth=depth, collapse=collapse))
def _repr_html_(self) -> str:
return node_tree_to_html(self, depth=2, collapse=True)
# Allow "key=value/value" / qube to prepend keys
def __rtruediv__(self, other: str) -> Qube:
def __rtruediv__(self, other: str) -> "Qube":
key, values = other.split("=")
values_enum = QEnum((values.split("/")))
return Qube.make_root([Qube.make_node(key, values_enum, self.children)])
values = QEnum((values.split("/")))
return Qube.root_node([Qube.make(key, values, self.children)])
def __or__(self, other: Qube) -> Qube:
def __or__(self, other: "Qube") -> "Qube":
return set_operations.operation(
self, other, set_operations.SetOperation.UNION, type(self)
)
def __and__(self, other: Qube) -> Qube:
def __and__(self, other: "Qube") -> "Qube":
return set_operations.operation(
self, other, set_operations.SetOperation.INTERSECTION, type(self)
)
def __sub__(self, other: Qube) -> Qube:
def __sub__(self, other: "Qube") -> "Qube":
return set_operations.operation(
self, other, set_operations.SetOperation.DIFFERENCE, type(self)
)
def __xor__(self, other: Qube) -> Qube:
def __xor__(self, other: "Qube") -> "Qube":
return set_operations.operation(
self, other, set_operations.SetOperation.SYMMETRIC_DIFFERENCE, type(self)
)
@ -357,26 +242,18 @@ class Qube:
else:
yield leaf
def leaf_nodes(self) -> "Iterable[tuple[dict[str, str], Qube]]":
for value in self.values:
if not self.children:
yield ({self.key: value}, self)
for child in self.children:
for leaf in child.leaf_nodes():
if self.key != "root":
yield ({self.key: value, **leaf[0]}, leaf[1])
else:
yield leaf
def leaves_with_metadata(
self, indices=()
) -> Iterator[tuple[dict[str, str], dict[str, str | np.ndarray]]]:
) -> Iterable[tuple[dict[str, str], dict[str, str]]]:
if self.key == "root":
for c in self.children:
yield from c.leaves_with_metadata(indices=())
for leaf in c.leaves_with_metadata(indices=()):
yield leaf
return
for index, value in enumerate(self.values):
# print(self.key, index, indices, value)
# print({k: np.shape(v) for k, v in self.metadata.items()})
indexed_metadata = {
k: vs[indices + (index,)] for k, vs in self.metadata.items()
}
@ -395,29 +272,25 @@ class Qube:
else:
yield leaf, metadata
def datacubes(self) -> Iterable[dict[str, Any | list[Any]]]:
def to_list_of_cubes(node: Qube) -> Iterable[dict[str, Any | list[Any]]]:
if node.key == "root":
for c in node.children:
yield from to_list_of_cubes(c)
def datacubes(self) -> "Qube":
def to_list_of_cubes(node: Qube) -> Iterable[Qube]:
if not node.children:
yield node
# print(node.key)
for c in node.children:
# print(c)
for sub_cube in to_list_of_cubes(c):
yield node.replace(children=[sub_cube])
else:
if not node.children:
yield {node.key: list(node.values)}
return Qube.root_node((q for c in self.children for q in to_list_of_cubes(c)))
for c in node.children:
for sub_cube in to_list_of_cubes(c):
yield {node.key: list(node.values)} | sub_cube
return to_list_of_cubes(self)
def __getitem__(self, args) -> Qube:
def __getitem__(self, args) -> "Qube":
if isinstance(args, str):
specifiers = args.split(",")
current = self
for specifier in specifiers:
key, values_str = specifier.split("=")
values = values_str.split("/")
key, values = specifier.split("=")
values = values.split("/")
for c in current.children:
if c.key == key and set(values) == set(c.values):
current = c
@ -426,16 +299,16 @@ class Qube:
raise KeyError(
f"Key '{key}' not found in children of '{current.key}', available keys are {[c.key for c in current.children]}"
)
return Qube.make_root(current.children)
return Qube.root_node(current.children)
elif isinstance(args, tuple) and len(args) == 2:
key, value = args
for c in self.children:
if c.key == key and value in c.values:
return Qube.make_root(c.children)
raise KeyError(f"Key '{key}' not found in children of {self.key}")
return Qube.root_node(c.children)
raise KeyError(f"Key {key} not found in children of {self.key}")
else:
raise ValueError(f"Unknown key type {args}")
raise ValueError("Unknown key type")
@cached_property
def n_leaves(self) -> int:
@ -452,7 +325,7 @@ class Qube:
return 0
return 1 + sum(c.n_nodes for c in self.children)
def transform(self, func: "Callable[[Qube], Qube | Iterable[Qube]]") -> Qube:
def transform(self, func: "Callable[[Qube], Qube | Iterable[Qube]]") -> "Qube":
"""
Call a function on every node of the Qube, return one or more nodes.
If multiple nodes are returned they each get a copy of the (transformed) children of the original node.
@ -470,29 +343,11 @@ class Qube:
children = tuple(cc for c in self.children for cc in transform(c))
return self.replace(children=children)
def remove_by_key(self, keys: str | list[str]):
_keys: list[str] = keys if isinstance(keys, list) else [keys]
def remove_key(node: Qube) -> Qube:
children: list[Qube] = []
for c in node.children:
if c.key in _keys:
grandchildren = tuple(sorted(remove_key(cc) for cc in c.children))
grandchildren = remove_key(Qube.make_root(grandchildren)).children
children.extend(grandchildren)
else:
children.append(remove_key(c))
return node.replace(children=tuple(sorted(children)))
return remove_key(self).compress()
def convert_dtypes(self, converters: dict[str, Callable[[Any], Any]]):
def convert(node: Qube) -> Qube:
if node.key in converters:
converter = converters[node.key]
values = [converter(v) for v in node.values]
new_node = node.replace(values=QEnum(values))
new_node = node.replace(values=QEnum(map(converter, node.values)))
return new_node
return node
@ -502,94 +357,57 @@ class Qube:
self,
selection: dict[str, str | list[str] | Callable[[Any], bool]],
mode: Literal["strict", "relaxed"] = "relaxed",
prune=True,
consume=False,
) -> Qube:
# Find any bare str values and replace them with [str]
_selection: dict[str, list[str] | Callable[[Any], bool]] = {}
for k, v in selection.items():
if isinstance(v, list):
_selection[k] = v
elif callable(v):
_selection[k] = v
else:
_selection[k] = [v]
) -> "Qube":
# make all values lists
selection: dict[str, list[str] | Callable[[Any], bool]] = {
k: v if isinstance(v, list | Callable) else [v]
for k, v in selection.items()
}
def not_none(xs):
return tuple(x for x in xs if x is not None)
def select(
node: Qube,
selection: dict[str, list[str] | Callable[[Any], bool]],
matched: bool,
) -> Qube | None:
def select(node: Qube, selection: dict[str, list[str]]) -> Qube | None:
# If this node has no children but there are still parts of the request
# that have not been consumed, then prune this whole branch
if consume and not node.children and selection:
return None
# If the key isn't in the selection then what we do depends on the mode:
# In strict mode we just stop here
# In next_level mode we include the next level down so you can tell what keys to add next
# In relaxed mode we skip the key if it's not in the request and carry on
# Check if the key is specified in the selection
if node.key not in selection:
if mode == "strict":
return None
elif mode == "next_level":
return node.replace(
children=(),
metadata=self.metadata
| {"is_leaf": np.array([not bool(node.children)])},
)
new_children = not_none(select(c, selection) for c in node.children)
elif mode == "relaxed":
pass
else:
raise ValueError(f"Unknown mode argument {mode}")
# If the key IS in the selection then check if the values match
if node.key in _selection:
# If the key is specified, check if any of the values match
selection_criteria = _selection[node.key]
if callable(selection_criteria):
values = QEnum((c for c in node.values if selection_criteria(c)))
elif isinstance(selection_criteria, list):
values = QEnum((c for c in selection_criteria if c in node.values))
else:
raise ValueError(f"Unknown selection type {selection_criteria}")
# Here modes don't matter because we've explicitly filtered on this key and found nothing
if not values:
# prune==true then remove any non-leaf nodes
# which have had all their children removed
if prune and node.children and not new_children:
return None
matched = True
node = node.replace(values=values)
return node.replace(children=new_children)
# If the key is specified, check if any of the values match
selection_criteria = selection[node.key]
if isinstance(selection_criteria, Callable):
values = QEnum((c for c in node.values if selection_criteria(c)))
else:
values = QEnum((c for c in selection[node.key] if c in node.values))
if not values:
return None
if consume:
selection = {k: v for k, v in selection.items() if k != node.key}
# Prune nodes that have had all their children pruned
new_children = not_none(
select(c, selection, matched) for c in node.children
)
if node.children and not new_children:
return None
metadata = dict(node.metadata)
if mode == "next_level":
metadata["is_leaf"] = np.array([not bool(node.children)])
return node.replace(
children=new_children,
metadata=metadata,
values=values,
children=not_none(select(c, selection) for c in node.children),
)
return self.replace(
children=not_none(
select(c, _selection, matched=False) for c in self.children
)
children=not_none(select(c, selection) for c in self.children)
)
def span(self, key: str) -> list[str]:
@ -611,26 +429,6 @@ class Qube:
axes[self.key].update(self.values)
return dict(axes)
def axes_info(self, depth=0) -> dict[str, AxisInfo]:
axes = defaultdict(
lambda: AxisInfo(key="", type=str, depths=set(), values=set())
)
for c in self.children:
for k, info in c.axes_info(depth=depth + 1).items():
axes[k].combine(info)
if self.key != "root":
axes[self.key].combine(
AxisInfo(
key=self.key,
type=type(next(iter(self.values))),
depths={depth},
values=set(self.values),
)
)
return dict(axes)
@cached_property
def structural_hash(self) -> int:
"""
@ -645,48 +443,12 @@ class Qube:
return hash_node(self)
def compress(self) -> Qube:
"""
This method is quite computationally heavy because of trees like this:
root, class=d1, generation=1
time=0600, many identical keys, param=8,78,79
time=0600, many identical keys, param=8,78,79
time=0600, many identical keys, param=8,78,79
This tree compresses dow n
def compress(self) -> "Qube":
# First compress the children (this recursively compresses all the way to the leaves)
new_children = [child.compress() for child in self.children]
"""
# Now compress the set of children at this level
new_children = set_operations.compress_children(new_children)
def union(a: Qube, b: Qube) -> Qube:
b = type(self).make_root(children=(b,))
out = set_operations.operation(
a, b, set_operations.SetOperation.UNION, type(self)
)
return out
new_children = [c.compress() for c in self.children]
if len(new_children) > 1:
new_children = list(
functools.reduce(union, new_children, Qube.empty()).children
)
return self.replace(children=tuple(sorted(new_children)))
def add_metadata(self, **kwargs: dict[str, Any]):
metadata = {
k: np.array(
[
v,
]
)
for k, v in kwargs.items()
}
return self.replace(metadata=metadata)
def strip_metadata(self) -> Qube:
def strip(node):
return node.replace(metadata=frozendict({}))
return self.transform(strip)
def display(self):
_display(self)
# Return the now compressed node
return Qube.make(self.key, self.values, new_children)

View File

@ -1,4 +1,3 @@
from . import protobuf
from .Qube import Qube
__all__ = ["Qube", "protobuf"]
__all__ = ["Qube"]

View File

@ -1,4 +1,3 @@
import json
import time
import click
@ -108,15 +107,7 @@ def convert(input, output, from_format, to_format):
n0 = q.n_leaves
t = time.time()
if to_format == "text":
output_content = str(q)
elif to_format == "json":
output_content = json.dumps(q.to_json())
elif to_format == "html":
output_content = q.html().html
else:
output_content = str(q)
output_content = str(q) if to_format == "text" else q.html().html
output.write(output_content)

View File

@ -8,10 +8,8 @@ def parse_key_value_pairs(text: str):
for segment in text.split(","):
if "=" not in segment:
print(segment)
key, values_str = segment.split(
"=", 1
) # Ensure split only happens at first "="
values = values_str.split("/")
key, values = segment.split("=", 1) # Ensure split only happens at first "="
values = values.split("/")
result[key] = values
return result

View File

@ -1,43 +0,0 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Iterator
import numpy as np
from .value_types import QEnum
if TYPE_CHECKING:
from .Qube import Qube
def make_node(
    cls,
    key: str,
    values: Iterator,
    shape: list[int],
    children: tuple[Qube, ...],
    metadata: dict[str, np.ndarray] | None = None,
):
    """Build one node through `cls.make_node`.

    The values are wrapped in a QEnum and every metadata entry is converted
    to an array reshaped to `shape`. A `metadata` of None becomes an empty dict.
    """
    node_values = QEnum(values)
    if metadata is None:
        shaped_metadata = {}
    else:
        shaped_metadata = {
            name: np.array(entry).reshape(shape) for name, entry in metadata.items()
        }
    return cls.make_node(
        key=key,
        values=node_values,
        metadata=shaped_metadata,
        children=children,
    )
def from_nodes(cls, nodes, add_root=True):
    """Build a single chain of nodes from a mapping of key -> node info.

    The last entry of `nodes` becomes the deepest node and each earlier entry
    wraps the chain built so far as its only child. Each info dict must hold
    "values" and is forwarded to `make_node` as keyword arguments. With
    `add_root` the chain is wrapped with `cls.make_root`.
    """
    entries = list(nodes.items())
    # One length per level; trimmed from the end as we walk back up the chain.
    shape = [len(info["values"]) for _, info in entries]

    *ancestors, (leaf_key, leaf_info) = entries
    chain = make_node(cls, key=leaf_key, shape=shape, children=(), **leaf_info)

    while ancestors:
        key, info = ancestors.pop()
        shape.pop()
        chain = make_node(cls, key=key, shape=shape, children=(chain,), **info)

    return cls.make_root(children=(chain,)) if add_root else chain

View File

@ -0,0 +1,26 @@
from dataclasses import dataclass, field
from typing import Hashable
from frozendict import frozendict
from .value_types import ValueGroup
@dataclass(frozen=False, eq=True, order=True, unsafe_hash=True)
class NodeData:
    """Payload of a single node: a key, its group of values and optional metadata.

    The metadata field is declared with compare=False, so it takes no part in
    the generated comparison methods.
    """

    key: str
    values: ValueGroup
    metadata: dict[str, tuple[Hashable, ...]] = field(
        default_factory=frozendict, compare=False
    )

    def summary(self) -> str:
        """Return "root" for the root key, otherwise "key=<values summary>"."""
        if self.key == "root":
            return "root"
        return f"{self.key}={self.values.summary()}"
# unsafe_hash=True mirrors NodeData. Without it, eq=True together with
# frozen=False makes the dataclass decorator set __hash__ to None on this
# subclass, so instances would be unhashable even though the parent is hashable.
@dataclass(frozen=False, eq=True, order=True, unsafe_hash=True)
class RootNodeData(NodeData):
    "Helper class to print a custom root name"

    def summary(self) -> str:
        """Return only the key, with no "=values" suffix."""
        return self.key

View File

@ -1,109 +0,0 @@
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING
import numpy as np
from frozendict import frozendict
from ..value_types import QEnum
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Protobuf gencode version",
UserWarning,
"google.protobuf.runtime_version",
)
from . import qube_pb2
if TYPE_CHECKING:
from ..Qube import Qube
def _ndarray_to_proto(arr: np.ndarray) -> qube_pb2.NdArray:
    """Pack an array's shape, dtype string and C-ordered bytes into an NdArray message."""
    dims = list(arr.shape)
    dtype_name = str(arr.dtype)
    payload = arr.tobytes(order="C")
    return qube_pb2.NdArray(shape=dims, dtype=dtype_name, raw=payload)
def _ndarray_from_proto(msg: qube_pb2.NdArray) -> np.ndarray:
    """Rebuild an array from an NdArray message using its raw bytes, dtype and shape."""
    flat = np.frombuffer(msg.raw, dtype=msg.dtype)
    dims = tuple(msg.shape)
    return flat.reshape(dims)
def _py_to_valuegroup(value: list[str] | np.ndarray) -> qube_pb2.ValueGroup:
    """Convert values to a ValueGroup: arrays fill `tensor`, anything else extends `s.items`."""
    group = qube_pb2.ValueGroup()
    if not isinstance(value, np.ndarray):
        group.s.items.extend(value)
        return group
    group.tensor.CopyFrom(_ndarray_to_proto(value))
    return group
def _valuegroup_to_py(vg: qube_pb2.ValueGroup) -> list[str] | np.ndarray:
    """Convert a ValueGroup back: the `tensor` arm gives an ndarray, otherwise a QEnum of `s.items`."""
    arm = vg.WhichOneof("payload")
    is_tensor = arm == "tensor"
    return _ndarray_from_proto(vg.tensor) if is_tensor else QEnum(vg.s.items)
def _py_to_metadatagroup(value: np.ndarray) -> qube_pb2.MetadataGroup:
    """Convert metadata to a MetadataGroup; a non-array value is first wrapped in a one-element array."""
    array = value if isinstance(value, np.ndarray) else np.array([value])
    group = qube_pb2.MetadataGroup()
    group.tensor.CopyFrom(_ndarray_to_proto(array))
    return group
def _metadatagroup_to_py(vg: qube_pb2.MetadataGroup) -> np.ndarray:
    """Convert a MetadataGroup to an ndarray.

    Raises:
        ValueError: if the populated `payload` arm is not "tensor".
    """
    arm = vg.WhichOneof("payload")
    if arm != "tensor":
        raise ValueError(f"Unknown arm {arm}")
    return _ndarray_from_proto(vg.tensor)
def _qube_to_proto(q: Qube) -> qube_pb2.Qube:
    """Recursively convert a Qube and all of its children into a new protobuf Qube message."""
    node_key = q.key
    proto_values = _py_to_valuegroup(q.values)
    proto_metadata = {
        name: _py_to_metadatagroup(array) for name, array in q.metadata.items()
    }
    proto_children = [_qube_to_proto(child) for child in q.children]
    return qube_pb2.Qube(
        key=node_key,
        values=proto_values,
        metadata=proto_metadata,
        children=proto_children,
        is_root=q.is_root,
    )
def qube_to_proto(q: Qube) -> bytes:
    """Serialise a Qube to bytes by way of its protobuf message."""
    message = _qube_to_proto(q)
    return message.SerializeToString()
def _proto_to_qube(cls: type, msg: qube_pb2.Qube) -> Qube:
    """Recursively convert a protobuf Qube message into a node built with `cls.make_node`."""
    node_values = _valuegroup_to_py(msg.values)
    node_metadata = frozendict(
        {name: _metadatagroup_to_py(group) for name, group in msg.metadata.items()}
    )
    node_children = tuple(_proto_to_qube(cls, child) for child in msg.children)
    return cls.make_node(
        key=msg.key,
        values=node_values,
        metadata=node_metadata,
        children=node_children,
        is_root=msg.is_root,
    )
def proto_to_qube(cls: type, wire: bytes) -> Qube:
    """Parse serialised bytes into a protobuf Qube message and convert it with `_proto_to_qube`."""
    parsed = qube_pb2.Qube()
    parsed.ParseFromString(wire)
    return _proto_to_qube(cls, parsed)

View File

@ -1,45 +0,0 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: qube.proto
# Protobuf Python Version: 5.29.0
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
_runtime_version.Domain.PUBLIC, 5, 29, 0, "", "qube.proto"
)
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\nqube.proto"4\n\x07NdArray\x12\r\n\x05shape\x18\x01 \x03(\x03\x12\r\n\x05\x64type\x18\x02 \x01(\t\x12\x0b\n\x03raw\x18\x03 \x01(\x0c"\x1c\n\x0bStringGroup\x12\r\n\x05items\x18\x01 \x03(\t"N\n\nValueGroup\x12\x19\n\x01s\x18\x01 \x01(\x0b\x32\x0c.StringGroupH\x00\x12\x1a\n\x06tensor\x18\x02 \x01(\x0b\x32\x08.NdArrayH\x00\x42\t\n\x07payload"6\n\rMetadataGroup\x12\x1a\n\x06tensor\x18\x01 \x01(\x0b\x32\x08.NdArrayH\x00\x42\t\n\x07payload"\xd1\x01\n\x04Qube\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1b\n\x06values\x18\x02 \x01(\x0b\x32\x0b.ValueGroup\x12%\n\x08metadata\x18\x03 \x03(\x0b\x32\x13.Qube.MetadataEntry\x12\r\n\x05\x64type\x18\x04 \x01(\t\x12\x17\n\x08\x63hildren\x18\x05 \x03(\x0b\x32\x05.Qube\x12\x0f\n\x07is_root\x18\x06 \x01(\x08\x1a?\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1d\n\x05value\x18\x02 \x01(\x0b\x32\x0e.MetadataGroup:\x02\x38\x01\x62\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "qube_pb2", _globals)
if not _descriptor._USE_C_DESCRIPTORS:
DESCRIPTOR._loaded_options = None
_globals["_QUBE_METADATAENTRY"]._loaded_options = None
_globals["_QUBE_METADATAENTRY"]._serialized_options = b"8\001"
_globals["_NDARRAY"]._serialized_start = 14
_globals["_NDARRAY"]._serialized_end = 66
_globals["_STRINGGROUP"]._serialized_start = 68
_globals["_STRINGGROUP"]._serialized_end = 96
_globals["_VALUEGROUP"]._serialized_start = 98
_globals["_VALUEGROUP"]._serialized_end = 176
_globals["_METADATAGROUP"]._serialized_start = 178
_globals["_METADATAGROUP"]._serialized_end = 232
_globals["_QUBE"]._serialized_start = 235
_globals["_QUBE"]._serialized_end = 444
_globals["_QUBE_METADATAENTRY"]._serialized_start = 381
_globals["_QUBE_METADATAENTRY"]._serialized_end = 444
# @@protoc_insertion_point(module_scope)

View File

@ -1,464 +1,170 @@
"""
# Set Operations
The core of this is the observation that for two sets A and B, if we compute (A - B), (A ∩ B) and (B - A)
then we can get the other operations by taking unions of the above three objects.
Union: All of them
Intersection: Just take A ∩ B
Difference: Take either A - B or B - A
Symmetric Difference (XOR): Take A - B and B - A
We start with a shallow implementation of this algorithm that only deals with a pair of nodes, not the whole tree:
shallow_set_operation(A: Qube, B: Qube) -> SetOpsResult
This takes two qubes and (morally) returns (A - B), (A ∩ B) and (B - A) but only for the values and metadata at the top level.
For technical reasons that will become clear we actually return a struct with two copies of (A ∩ B). One has the metadata and children of A, call it A', and the other has those of B, call it B'. This is relevant when we extend the shallow algorithm to work with a whole tree because we will recurse and compute the set operation for each pair of the children of A' and B'.
NB: Currently there are two kinds of values, QEnums, that store a list of values and Wildcards that 'match with everything'. shallow_set_operation checks the type of values and dispatches to different methods depending on the combination of types it finds.
"""
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from dataclasses import replace
from enum import Enum
# Prevent circular imports while allowing the type checker to know what Qube is
from typing import TYPE_CHECKING, Any, Iterable
from typing import TYPE_CHECKING, Iterable
import numpy as np
from frozendict import frozendict
from .node_types import NodeData
from .value_types import QEnum, ValueGroup, WildcardGroup
if TYPE_CHECKING:
from .Qube import Qube
from .qube import Qube
class SetOperation(Enum):
    """Map each set operation to the pieces it keeps.

    Every value is a triple of 0/1 flags in the order
    (A - B, A ∩ B, B - A); 1 means that piece is part of the result.
    """

    UNION = (1, 1, 1)  # everything
    INTERSECTION = (0, 1, 0)  # only the overlap
    DIFFERENCE = (1, 0, 0)  # only what is exclusive to A
    SYMMETRIC_DIFFERENCE = (1, 0, 1)  # what is exclusive to either side
@dataclass(eq=True, frozen=True)
class ValuesIndices:
"Helper class to hold the values and indices from a node."
def node_intersection(
A: "ValueGroup", B: "ValueGroup"
) -> tuple[ValueGroup, ValueGroup, ValueGroup]:
if isinstance(A, QEnum) and isinstance(B, QEnum):
set_A, set_B = set(A), set(B)
intersection = set_A & set_B
just_A = set_A - intersection
just_B = set_B - intersection
return QEnum(just_A), QEnum(intersection), QEnum(just_B)
values: ValueGroup
indices: tuple[int, ...]
if isinstance(A, WildcardGroup) and isinstance(B, WildcardGroup):
return A, WildcardGroup(), B
@classmethod
def from_values(cls, values: ValueGroup):
return cls(values=values, indices=tuple(range(len(values))))
# If A is a wildcard matcher then the intersection is everything
# just_A is still *
# just_B is empty
if isinstance(A, WildcardGroup):
return A, B, QEnum([])
@classmethod
def empty(cls):
return cls(values=QEnum([]), indices=())
def enumerate(self) -> Iterable[tuple[Any, int]]:
return zip(self.indices, self.values)
def get_indices(
    metadata: frozendict[str, np.ndarray], indices: tuple[int, ...]
) -> frozendict[str, np.ndarray]:
    """
    Select entries along the last axis of every metadata array.

    Entries of `metadata` that are not numpy arrays are left out of the result.
    """
    selected = {}
    for name, array in metadata.items():
        if not isinstance(array, np.ndarray):
            continue
        # Index the trailing axis with `indices`, keeping all leading axes
        selected[name] = array[..., indices]
    return frozendict(selected)
@dataclass(frozen=True)
class SetOpResult:
    """
    The partition produced by comparing the values of two nodes A and B.

    Every set operation can be built from three pieces: A - B, A ∩ B and B - A,
    i.e. what is only in A, what is shared, and what is only in B.
    Because the operation later recurses on children, the shared piece is
    returned twice, once from each side:

    only_A:
        The values in A but not in B, with the indices of those values in A
    intersection_A:
        The values shared with B, with the indices of those values in A
    intersection_B:
        The values shared with A, with the indices of those values in B
    only_B:
        The values in B but not in A, with the indices of those values in B
    """

    # NB: field order matters, callers construct this class positionally.
    only_A: ValuesIndices
    intersection_A: ValuesIndices
    intersection_B: ValuesIndices
    only_B: ValuesIndices
def shallow_qenum_set_operation(A: ValuesIndices, B: ValuesIndices) -> SetOpResult:
    """
    For two sets of values, partition the overlap into four groups:
    only_A: values and indices of values that are in A but not B
    intersection_A: values and indices of values that are in both A and B
    And vice versa for only_B and intersection_B.

    Note that intersection_A and intersection_B contain the same values but the indices are different.
    """
    # create four groups that map value -> index
    only_A: dict[Any, int] = {val: i for i, val in A.enumerate()}
    only_B: dict[Any, int] = {val: i for i, val in B.enumerate()}
    intersection_A: dict[Any, int] = {}
    intersection_B: dict[Any, int] = {}

    # Go through all the values and move any that are in the intersection
    # to the corresponding group, keeping the indices.
    # Membership is tested against the value -> index dicts rather than
    # against B.values: that is a hash lookup instead of a scan of B for every
    # value of A, and a value that has already been moved (e.g. a duplicate
    # in A) is skipped instead of raising KeyError on the second pop.
    for val in A.values:
        if val in only_A and val in only_B:
            intersection_A[val] = only_A.pop(val)
            intersection_B[val] = only_B.pop(val)

    def package(values_indices: dict[Any, int]) -> ValuesIndices:
        "Turn a value -> index mapping back into a ValuesIndices"
        return ValuesIndices(
            values=QEnum(list(values_indices.keys())),
            indices=tuple(values_indices.values()),
        )

    return SetOpResult(
        only_A=package(only_A),
        only_B=package(only_B),
        intersection_A=package(intersection_A),
        intersection_B=package(intersection_B),
    )
def shallow_wildcard_set_operation(A: ValuesIndices, B: ValuesIndices) -> SetOpResult:
"""
WildcardGroups behave as if they contain all the values of whatever they match against.
For two wildcards we just return both.
For A == wildcard and B == enum we have to be more careful:
1. All of B is in the intersection so only_B is None too.
2. The wildcard may need to match against other things so only_A is A
3. We return B in the intersection_B and intersection_A slot.
This last bit happens because the wildcard basically adopts the values of whatever it sees.
"""
# Two wildcard groups have full overlap.
if isinstance(A.values, WildcardGroup) and isinstance(B.values, WildcardGroup):
return SetOpResult(ValuesIndices.empty(), A, B, ValuesIndices.empty())
# If A is a wildcard matcher and B is not
# then the intersection is everything from B
if isinstance(A.values, WildcardGroup):
return SetOpResult(A, B, B, ValuesIndices.empty())
# If B is a wildcard matcher and A is not
# then the intersection is everything from A
if isinstance(B.values, WildcardGroup):
return SetOpResult(ValuesIndices.empty(), A, A, B)
# The reverse if B is a wildcard
if isinstance(B, WildcardGroup):
return QEnum([]), A, B
raise NotImplementedError(
f"One of {type(A.values)} and {type(B.values)} should be WildCardGroup"
f"Fused set operations on values types {type(A)} and {type(B)} not yet implemented"
)
def shallow_set_operation(
    A: ValuesIndices,
    B: ValuesIndices,
) -> SetOpResult:
    """
    Dispatch to the right shallow set operation for the value types of A and B.

    Two QEnums use the enum implementation; if either side is a WildcardGroup
    the wildcard implementation is used. Any other combination raises
    NotImplementedError.
    """
    a_values, b_values = A.values, B.values

    both_enums = isinstance(a_values, QEnum) and isinstance(b_values, QEnum)
    if both_enums:
        return shallow_qenum_set_operation(A, B)

    # WildcardGroups behave as if they contain all possible values.
    any_wildcard = any(isinstance(v, WildcardGroup) for v in (a_values, b_values))
    if any_wildcard:
        return shallow_wildcard_set_operation(A, B)

    raise NotImplementedError(
        f"Set operations on values types {type(A.values)} and {type(B.values)} not yet implemented"
    )
def operation(
A: Qube, B: Qube, operation_type: SetOperation, node_type, depth=0
) -> Qube | None:
# print(f"operation({A}, {B})")
def operation(A: "Qube", B: "Qube", operation_type: SetOperation, node_type) -> "Qube":
assert A.key == B.key, (
"The two Qube root nodes must have the same key to perform set operations,"
f"would usually be two root nodes. They have {A.key} and {B.key} respectively"
)
node_key = A.key
assert A.is_root == B.is_root
is_root = A.is_root
assert A.values == B.values, (
f"The two Qube root nodes must have the same values to perform set operations {A.values = }, {B.values = }"
)
node_values = A.values
# Group the children of the two nodes by key
nodes_by_key: defaultdict[str, tuple[list[Qube], list[Qube]]] = defaultdict(
lambda: ([], [])
)
new_children: list[Qube] = []
# Sort out metadata into what can stay at this level and what must move down
stayput_metadata: dict[str, np.ndarray] = {}
pushdown_metadata_A: dict[str, np.ndarray] = {}
pushdown_metadata_B: dict[str, np.ndarray] = {}
for key in set(A.metadata.keys()) | set(B.metadata.keys()):
if key not in A.metadata:
pushdown_metadata_B[key] = B.metadata[key]
continue
if key not in B.metadata:
pushdown_metadata_A[key] = A.metadata[key]
continue
A_val = A.metadata[key]
B_val = B.metadata[key]
if np.allclose(A_val, B_val):
# print(f"{' ' * depth}Keeping metadata key '{key}' at this level")
stayput_metadata[key] = A.metadata[key]
else:
# print(f"{' ' * depth}Pushing down metadata key '{key}' {A_val} {B_val}")
pushdown_metadata_A[key] = A_val
pushdown_metadata_B[key] = B_val
# Add all the metadata that needs to be pushed down to the child nodes
# When pushing down the metadata we need to account for the fact it now affects more values
# So expand the metadata entries from shape (a, b, ..., c) to (a, b, ..., c, d)
# where d is the length of the node values
nodes_by_key = defaultdict(lambda: ([], []))
for node in A.children:
N = len(node.values)
meta = {
k: np.broadcast_to(v[..., np.newaxis], v.shape + (N,))
for k, v in pushdown_metadata_A.items()
}
node = node.replace(metadata=node.metadata | meta)
nodes_by_key[node.key][0].append(node)
for node in B.children:
N = len(node.values)
meta = {
k: np.broadcast_to(v[..., np.newaxis], v.shape + (N,))
for k, v in pushdown_metadata_B.items()
}
node = node.replace(metadata=node.metadata | meta)
nodes_by_key[node.key][1].append(node)
# print(f"{nodes_by_key = }")
new_children = []
# For every node group, perform the set operation
for key, (A_nodes, B_nodes) in nodes_by_key.items():
output = list(
_operation(A_nodes, B_nodes, operation_type, node_type, depth + 1)
new_children.extend(
_operation(key, A_nodes, B_nodes, operation_type, node_type)
)
# print(f"{' '*depth}_operation {operation_type.name} {A_nodes} {B_nodes} out = [{output}]")
new_children.extend(output)
# print(f"{' '*depth}operation {operation_type.name} [{A}] [{B}] new_children = [{new_children}]")
# If there are now no children as a result of the operation, return nothing.
if (A.children or B.children) and not new_children:
if A.key == "root":
return node_type.make_root(children=())
else:
return None
# Whenever we modify children we should recompress them
# But since `operation` is already recursive, we only need to compress this level not all levels
# Hence we use the non-recursive _compress method
new_children = list(compress_children(new_children))
new_children = compress_children(new_children)
# The values and key are the same so we just replace the children
if A.key == "root":
return node_type.make_root(
children=new_children,
metadata=stayput_metadata,
)
return node_type.make_node(
key=node_key,
values=node_values,
children=new_children,
metadata=stayput_metadata,
is_root=is_root,
)
return replace(A, children=new_children)
# The root node is special so we need a helper method that we can recurse on
def _operation(
A: list[Qube],
B: list[Qube],
operation_type: SetOperation,
node_type,
depth: int,
) -> Iterable[Qube]:
"""
This operation assumes that we've found two nodes that match and now want to do a set operation on their children. Hence we take in two lists of child nodes all of which have the same key but different values.
We then loop over all pairs of children from each list and compute the intersection.
"""
# print(f"_operation({A}, {B})")
keep_only_A, keep_intersection, keep_only_B = operation_type.value
key: str, A: list["Qube"], B: list["Qube"], operation_type: SetOperation, node_type
) -> Iterable["Qube"]:
keep_just_A, keep_intersection, keep_just_B = operation_type.value
# We're going to progressively remove values from the starting nodes as we do intersections
# So we make a node -> ValuesIndices mapping here for both a and b
only_a: dict[Qube, ValuesIndices] = {
n: ValuesIndices.from_values(n.values) for n in A
}
only_b: dict[Qube, ValuesIndices] = {
n: ValuesIndices.from_values(n.values) for n in B
}
# Iterate over all pairs (node_A, node_B)
values = {}
for node in A + B:
values[node] = node.values
def make_new_node(source: Qube, values_indices: ValuesIndices):
return source.replace(
values=values_indices.values,
metadata=get_indices(source.metadata, values_indices.indices),
)
# Iterate over all pairs (node_A, node_B) and perform the shallow set operation
# Update our copy of the original node to remove anything that appears in an intersection
for node_a in A:
for node_b in B:
set_ops_result = shallow_set_operation(only_a[node_a], only_b[node_b])
# Compute A - B, A & B, B - A
# Update the values for the two source nodes to remove the intersection
just_a, intersection, just_b = node_intersection(
values[node_a],
values[node_b],
)
# Save reduced values back to nodes
only_a[node_a] = set_ops_result.only_A
only_b[node_b] = set_ops_result.only_B
# Remove the intersection from the source nodes
values[node_a] = just_a
values[node_b] = just_b
if (
set_ops_result.intersection_A.values
and set_ops_result.intersection_B.values
):
result = operation(
make_new_node(node_a, set_ops_result.intersection_A),
make_new_node(node_b, set_ops_result.intersection_B),
operation_type,
node_type,
depth=depth + 1,
)
if result is not None:
# If we're doing a difference or xor we might want to throw away the intersection
# However we can only do this once we get to the leaf nodes, otherwise we'll
# throw away nodes too early!
# Consider Qube(root, a=1, b=1/2) - Qube(root, a=1, b=1)
# We can easily throw away the whole a node by accident here!
if keep_intersection or result.children:
yield result
elif (
not set_ops_result.intersection_A.values
and not set_ops_result.intersection_B.values
):
continue
else:
raise ValueError(
f"Only one of set_ops_result.intersection_A and set_ops_result.intersection_B is None, I didn't think that could happen! {set_ops_result = }"
)
if keep_intersection:
if intersection:
new_node_a = replace(
node_a, data=replace(node_a.data, values=intersection)
)
new_node_b = replace(
node_b, data=replace(node_b.data, values=intersection)
)
yield operation(new_node_a, new_node_b, operation_type, node_type)
if keep_only_A:
for node, vi in only_a.items():
if vi.values:
yield make_new_node(node, vi)
if keep_only_B:
for node, vi in only_b.items():
if vi.values:
yield make_new_node(node, vi)
# Now we've removed all the intersections we can yield the just_A and just_B parts if needed
if keep_just_A:
for node in A:
if values[node]:
yield node_type.make(key, values[node], node.children)
if keep_just_B:
for node in B:
if values[node]:
yield node_type.make(key, values[node], node.children)
def compress_children(children: Iterable[Qube], depth=0) -> tuple[Qube, ...]:
def compress_children(children: Iterable["Qube"]) -> tuple["Qube"]:
"""
Helper method that only compresses a set of nodes, and doesn't do it recursively.
Used in Qubed.compress but also to maintain compression in the set operations above.
"""
# Take the set of new children and see if any have identical key, metadata and children
# Now take the set of new children and see if any have identical key, metadata and children
# the values may differ and will be collapsed into a single node
identical_children = defaultdict(list)
identical_children = defaultdict(set)
for child in children:
# only care about the key and children of each node, ignore values
h = hash((child.key, tuple((cc.structural_hash for cc in child.children))))
identical_children[h].append(child)
key = hash((child.key, tuple((cc.structural_hash for cc in child.children))))
identical_children[key].add(child)
# Now go through and create new compressed nodes for any groups that need collapsing
new_children = []
for child_list in identical_children.values():
# If the group is size one just keep it
if len(child_list) == 1:
new_child = child_list.pop()
for child_set in identical_children.values():
if len(child_set) > 1:
child_set = list(child_set)
node_type = type(child_set[0])
key = child_set[0].key
# Compress the children into a single node
assert all(isinstance(child.data.values, QEnum) for child in child_set), (
"All children must have QEnum values"
)
node_data = NodeData(
key=key,
metadata=frozendict(), # Todo: Implement metadata compression
values=QEnum(
(v for child in child_set for v in child.data.values.values)
),
)
new_child = node_type(data=node_data, children=child_set[0].children)
else:
example = child_list[0]
node_type = type(example)
value_type = type(example.values)
assert all(isinstance(child.values, value_type) for child in child_list), (
f"All nodes to be grouped must have the same value type, expected {value_type}"
)
# We know the children of this group of nodes all have the same structure
# but we still need to merge the metadata across them
# children = example.children
children = merge_metadata(child_list, example.depth)
# Do we need to recusively compress here?
# children = compress_children(children, depth=depth+1)
if value_type is QEnum:
values = QEnum(set(v for child in child_list for v in child.values))
elif value_type is WildcardGroup:
values = example.values
else:
raise ValueError(f"Unknown value type: {value_type}")
new_child = node_type.make_node(
key=example.key,
metadata=example.metadata,
values=values,
children=children,
)
# If the group is size one just keep it
new_child = child_set.pop()
new_children.append(new_child)
return tuple(sorted(new_children, key=lambda n: ((n.key, n.values.min()))))
def merge_metadata(qubes: list[Qube], axis) -> Iterable[Qube]:
    """
    Merge the metadata of the children of several structurally identical qubes.

    The i-th children of every qube are grouped together, their metadata arrays
    are concatenated along `axis`, and one merged node per group is yielded.
    The same is done recursively for the children of each group.
    """
    # All qubes have the same shape and child ordering, so the first one
    # serves as the template for keys, values and node type.
    template = qubes[0]
    node_type = type(template)

    for position, template_child in enumerate(template.children):
        siblings = [qube.children[position] for qube in qubes]
        assert len({sibling.structural_hash for sibling in siblings}) == 1

        # For every metadata key, join the arrays of all siblings together
        merged: dict[str, np.ndarray] = {}
        for name in template_child.metadata.keys():
            pieces = [sibling.metadata[name] for sibling in siblings]
            merged[name] = np.concatenate(pieces, axis=axis)

        yield node_type.make_node(
            key=template_child.key,
            metadata=frozendict(merged),
            values=template_child.values,
            children=merge_metadata(siblings, axis),
        )

View File

@ -1,16 +1,16 @@
from __future__ import annotations
import random
from dataclasses import dataclass
from typing import TYPE_CHECKING, Callable, Iterable
from typing import Iterable, Protocol, Sequence, runtime_checkable
try:
from IPython.display import display
except ImportError:
display = None
if TYPE_CHECKING:
from .Qube import Qube
@runtime_checkable
class TreeLike(Protocol):
@property
def children(
self,
) -> Sequence["TreeLike"]: ... # Supports indexing like node.children[i]
def summary(self) -> str: ...
@dataclass(frozen=True)
@ -22,8 +22,8 @@ class HTML:
def summarize_node(
node: Qube, collapse=False, max_summary_length=50, **kwargs
) -> tuple[str, str, Qube]:
node: TreeLike, collapse=False, **kwargs
) -> tuple[str, str, TreeLike]:
"""
Extracts a summarized representation of the node while collapsing single-child paths.
Returns the summary string and the last node in the chain that has multiple children.
@ -33,10 +33,9 @@ def summarize_node(
while True:
summary = node.summary(**kwargs)
paths.append(summary)
if len(summary) > max_summary_length:
summary = summary[:max_summary_length] + "..."
if len(summary) > 50:
summary = summary[:50] + "..."
summaries.append(summary)
if not collapse:
break
@ -46,14 +45,10 @@ def summarize_node(
break
node = node.children[0]
# Add a "..." to represent nodes that we don't know about
if (not node.children) and (not node.is_leaf):
summaries.append("...")
return ", ".join(summaries), ",".join(paths), node
def node_tree_to_string(node: Qube, prefix: str = "", depth=None) -> Iterable[str]:
def node_tree_to_string(node: TreeLike, prefix: str = "", depth=None) -> Iterable[str]:
summary, path, node = summarize_node(node)
if depth is not None and depth <= 0:
@ -76,73 +71,17 @@ def node_tree_to_string(node: Qube, prefix: str = "", depth=None) -> Iterable[st
)
def summarize_node_html(
node: Qube,
collapse=False,
max_summary_length=50,
info: Callable[[Qube], str] | None = None,
**kwargs,
) -> tuple[str, Qube]:
"""
Extracts a summarized representation of the node while collapsing single-child paths.
Returns the summary string and the last node in the chain that has multiple children.
"""
if info is None:
def info_func(node: Qube, /):
return (
# f"dtype: {node.dtype}\n"
f"metadata: {dict(node.metadata)}\n"
)
else:
info_func = info
summaries = []
while True:
path = node.summary(**kwargs)
summary = path
if len(summary) > max_summary_length:
summary = summary[:max_summary_length] + "..."
info_string = info_func(node)
summary = f'<span class="qubed-node" data-path="{path}" title="{info_string}">{summary}</span>'
summaries.append(summary)
if not collapse:
break
# Move down if there's exactly one child, otherwise stop
if len(node.children) != 1:
break
node = node.children[0]
if (not node.children) and (not node.is_leaf):
summary = (
'<span class="qubed-node" data-path="" title="Truncated Nodes">...</span>'
)
summaries.append(summary)
return ", ".join(summaries), node
def _node_tree_to_html(
node: Qube,
prefix: str = "",
depth=1,
connector="",
info: Callable[[Qube], str] | None = None,
**kwargs,
node: TreeLike, prefix: str = "", depth=1, connector="", **kwargs
) -> Iterable[str]:
summary, node = summarize_node_html(node, info=info, **kwargs)
summary, path, node = summarize_node(node, **kwargs)
if len(node.children) == 0:
yield f'<span class="qubed-level">{connector}{summary}</span>'
yield f'<span class="qubed-node leaf" data-path="{path}">{connector}{summary}</span>'
return
else:
open = "open" if depth > 0 else ""
yield f'<details {open}><summary class="qubed-level">{connector}{summary}</summary>'
yield f'<details {open} data-path="{path}"><summary class="qubed-node">{connector}{summary}</summary>'
for index, child in enumerate(node.children):
connector = "└── " if index == len(node.children) - 1 else "├── "
@ -152,23 +91,13 @@ def _node_tree_to_html(
prefix + extension,
depth=depth - 1,
connector=prefix + connector,
info=info,
**kwargs,
)
yield "</details>"
def node_tree_to_html(
node: Qube,
depth=1,
include_css=True,
include_js=True,
css_id=None,
info: Callable[[Qube], str] | None = None,
**kwargs,
) -> str:
if css_id is None:
css_id = f"qubed-tree-{random.randint(0, 1000000)}"
def node_tree_to_html(node: TreeLike, depth=1, **kwargs) -> str:
css_id = f"qubed-tree-{random.randint(0, 1000000)}"
# It's ugly to use an f string here because css uses {} so much so instead
# we use CSS_ID as a placeholder and replace it later
@ -185,7 +114,7 @@ def node_tree_to_html(
margin-left: 0;
}
.qubed-level a {
.qubed-node a {
margin-left: 10px;
text-decoration: none;
}
@ -199,7 +128,7 @@ def node_tree_to_html(
display: block;
}
span.qubed-node:hover {
summary:hover,span.leaf:hover {
background-color: #f0f0f0;
}
@ -211,7 +140,7 @@ def node_tree_to_html(
content: "";
}
.qubed-level {
.leaf {
text-overflow: ellipsis;
overflow: hidden;
text-wrap: nowrap;
@ -246,26 +175,9 @@ def node_tree_to_html(
await navigator.clipboard.writeText(path);
}
const nodes = document.querySelectorAll("#CSS_ID.qubed-node");
const nodes = document.querySelectorAll("#CSS_ID .qubed-node");
nodes.forEach(n => n.addEventListener("click", nodeOnClick));
</script>
""".replace("CSS_ID", css_id)
nodes = "".join(_node_tree_to_html(node=node, depth=depth, info=info, **kwargs))
return f"{js if include_js else ''}{css if include_css else ''}<pre class='qubed-tree' id='{css_id}'>{nodes}</pre>"
def _display(qube: Qube, **kwargs):
if display is None:
print(qube)
else:
def info(node: Qube):
return f"""\
structural_hash = {node.structural_hash}
metadata = {dict(node.metadata)}
is_root = {node.is_root}
is_leaf = {node.is_leaf}
"""
kwargs = {"info": info} | kwargs
display(qube.html(**kwargs))
nodes = "".join(_node_tree_to_html(node=node, depth=depth, **kwargs))
return f"{js}{css}<pre class='qubed-tree' id='{css_id}'>{nodes}</pre>"

View File

@ -1,19 +1,8 @@
from __future__ import annotations
import dataclasses
from abc import ABC, abstractmethod
from dataclasses import dataclass
from dataclasses import dataclass, replace
from datetime import date, datetime, timedelta
from typing import (
TYPE_CHECKING,
Any,
FrozenSet,
Iterable,
Iterator,
Literal,
Sequence,
TypeVar,
)
from typing import TYPE_CHECKING, Any, FrozenSet, Iterable, Literal, TypeVar
if TYPE_CHECKING:
from .Qube import Qube
@ -21,11 +10,6 @@ if TYPE_CHECKING:
@dataclass(frozen=True)
class ValueGroup(ABC):
@abstractmethod
def dtype(self) -> str:
"Provide a string rep of the datatype of these values"
pass
@abstractmethod
def summary(self) -> str:
"Provide a string summary of the value group."
@ -46,57 +30,41 @@ class ValueGroup(ABC):
"Return the minimum value in the group."
pass
@classmethod
@dataclass(frozen=True)
class FiniteValueGroup(ValueGroup, ABC):
@abstractmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[ValueGroup]:
"Given a list of strings, return a one or more ValueGroups of this type."
def __len__(self) -> int:
"Return how many values this group contains."
pass
@abstractmethod
def __iter__(self) -> Iterator:
def __iter__(self) -> Iterable[Any]:
"Iterate over the values in the group."
pass
@classmethod
@abstractmethod
def __len__(self) -> int:
def from_strings(cls, values: Iterable[str]) -> list["ValueGroup"]:
"Given a list of strings, return a one or more ValueGroups of this type."
pass
T = TypeVar("T")
EnumValuesType = FrozenSet[T]
_dtype_map: dict[str, type] = {
"str": str,
"int64": int,
"float64": float,
"date": datetime,
}
_dtype_map_inv: dict[type, str] = {v: k for k, v in _dtype_map.items()}
_dtype_formatters = {
"str": str,
"int64": int,
"float64": float,
"date": datetime.fromisoformat,
}
@dataclass(frozen=True, order=True)
class QEnum(ValueGroup):
class QEnum(FiniteValueGroup):
"""
The simplest kind of key value is just a list of strings.
summary -> string1/string2/string....
"""
values: EnumValuesType
_dtype: str = "str"
def __init__(self, obj, dtype="str"):
object.__setattr__(self, "values", tuple(sorted(obj)))
object.__setattr__(
self,
"_dtype",
dtype,
)
def __init__(self, obj):
object.__setattr__(self, "values", frozenset(obj))
def __post_init__(self):
assert isinstance(self.values, tuple)
@ -113,29 +81,14 @@ class QEnum(ValueGroup):
def __contains__(self, value: Any) -> bool:
return value in self.values
def dtype(self):
return self._dtype
@classmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[ValueGroup]:
return [cls(tuple(values))]
def from_strings(self, values: Iterable[str]) -> list["ValueGroup"]:
return [type(self)(tuple(values))]
def min(self):
return min(self.values)
def to_json(self):
return {"type": "enum", "dtype": self.dtype(), "values": self.values}
# @classmethod
# def from_json(cls, type: Literal["enum"], dtype: str, values: list):
# dtype_formatter = _dtype_formatters[dtype]
@classmethod
def from_list(cls, obj):
example = obj[0]
dtype = type(example)
assert [type(v) is dtype for v in obj]
return cls(obj, dtype=_dtype_map_inv[dtype])
return list(self.values)
@dataclass(frozen=True, order=True)
@ -152,22 +105,6 @@ class WildcardGroup(ValueGroup):
def min(self):
return "*"
def __len__(self):
return 1
def __iter__(self):
return ["*"]
def __bool__(self):
return True
def dtype(self):
return "*"
@classmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[ValueGroup]:
return [WildcardGroup()]
class DateEnum(QEnum):
def summary(self) -> str:
@ -188,7 +125,7 @@ class Range(ValueGroup, ABC):
def min(self):
return self.start
def __iter__(self) -> Iterator[Any]:
def __iter__(self) -> Iterable[Any]:
i = self.start
while i <= self.end:
yield i
@ -208,19 +145,19 @@ class DateRange(Range):
def __len__(self) -> int:
return (self.end - self.start) // self.step
def __iter__(self) -> Iterator[date]:
def __iter__(self) -> Iterable[date]:
current = self.start
while current <= self.end if self.step.days > 0 else current >= self.end:
yield current
current += self.step
@classmethod
def from_strings(cls, values: Iterable[str]) -> Sequence[DateRange | DateEnum]:
def from_strings(cls, values: Iterable[str]) -> "list[DateRange | QEnum]":
dates = sorted([datetime.strptime(v, "%Y%m%d") for v in values])
if len(dates) < 2:
return [DateEnum(dates)]
ranges: list[DateEnum | DateRange] = []
ranges = []
current_group, dates = (
[
dates[0],
@ -306,7 +243,7 @@ class TimeRange(Range):
def min(self):
return self.start
def __iter__(self) -> Iterator[Any]:
def __iter__(self) -> Iterable[Any]:
return super().__iter__()
@classmethod
@ -416,19 +353,23 @@ class IntRange(Range):
return ranges
def values_from_json(obj: dict | list) -> ValueGroup:
def values_from_json(obj) -> ValueGroup:
if isinstance(obj, list):
return QEnum.from_list(obj)
return QEnum(tuple(obj))
match obj["type"]:
case "enum":
QEnum.from_json(**obj)
match obj["dtype"]:
case "date":
return DateRange(**obj)
case "time":
return TimeRange(**obj)
case "int":
return IntRange(**obj)
case _:
raise ValueError(f"Unknown dtype {obj['dtype']}")
def convert_datatypes(q: "Qube", conversions: dict[str, ValueGroup]) -> "Qube":
def _convert(q: "Qube") -> Iterator["Qube"]:
def _convert(q: "Qube") -> Iterable["Qube"]:
if q.key in conversions:
data_type = conversions[q.key]
assert isinstance(q.values, QEnum), (
@ -436,7 +377,7 @@ def convert_datatypes(q: "Qube", conversions: dict[str, ValueGroup]) -> "Qube":
)
for values_group in data_type.from_strings(q.values):
# print(values_group)
yield q.replace(values=values_group)
yield replace(q, data=replace(q.data, values=values_group))
else:
yield q

View File

@ -1,32 +0,0 @@
syntax = "proto3";
// An n-dimensional array: its shape, a dtype string and the payload bytes.
message NdArray {
  // One entry per dimension of the array.
  repeated int64 shape = 1;
  // Name of the element type (presumably a numpy dtype string; confirm
  // against the serializer).
  string dtype = 2;
  // The array contents as bytes.
  bytes raw = 3;
}
// A list of strings.
message StringGroup {
  repeated string items = 1;
}
// Stores the values of a node, e.g. for class=1/2/3 this is the 1/2/3 part.
// Exactly one representation is set: a group of strings or an array.
message ValueGroup {
  oneof payload {
    StringGroup s = 1;
    NdArray tensor = 2;
  }
}
// One metadata entry of a node. The oneof currently has a single option,
// an array.
message MetadataGroup {
  oneof payload {
    NdArray tensor = 1;
  }
}
// A node of the tree together with all of its descendants.
message Qube {
  // The key of this node, e.g. "class" in class=1/2/3.
  string key = 1;
  // The values of this node, e.g. 1/2/3 in class=1/2/3.
  ValueGroup values = 2;
  // Metadata entries attached to this node, looked up by name.
  map<string, MetadataGroup> metadata = 3;
  string dtype = 4;
  // Child nodes, each a full subtree.
  repeated Qube children = 5;
  bool is_root = 6;
}

334
src/rust/compressed_tree.rs Normal file
View File

@ -0,0 +1,334 @@
#![allow(dead_code)]
use std::rc::Rc;
use smallstr::SmallString;
use slotmap::{new_key_type, SlotMap};
new_key_type! {
    /// Key identifying a node stored in a `QueryTree`'s slot map.
    struct NodeId;
}
/// String type used for node values: a `SmallString` backed by a 16 byte array.
type CompactString = SmallString<[u8; 16]>;
/// A single value held by a node: either a string or a 32-bit integer.
#[derive(Clone)]
enum NodeValueTypes {
    /// A string value.
    String(CompactString),
    /// An integer value.
    Int(i32),
}
impl From<&str> for NodeValueTypes {
    /// Wraps a string slice as a `NodeValueTypes::String`.
    fn from(text: &str) -> Self {
        Self::String(CompactString::from(text))
    }
}
impl From<i32> for NodeValueTypes {
fn from(i: i32) -> Self {
NodeValueTypes::Int(i)
}
}
/// The value(s) stored on a node: exactly one value, or a list of values.
enum NodeValue {
    /// The node holds a single value.
    Single(NodeValueTypes),
    /// The node holds several values.
    Multiple(Vec<NodeValueTypes>),
}
/// A tree node. Links to other nodes are stored as `NodeId`s; the children
/// of a node form a doubly linked list through the sibling links.
struct Node<Payload> {
    /// Key of this node, reference counted so it can be shared.
    key: Rc<String>,
    /// The value or values this node holds for `key`.
    value: NodeValue,
    /// Parent node, `None` for a root.
    parent: Option<NodeId>,
    /// Previous node in the parent's child list.
    prev_sibling: Option<NodeId>,
    /// Next node in the parent's child list.
    next_sibling: Option<NodeId>,
    // vector may be faster for traversal, but linkedlist should be faster for insertion
    /// `(first_child, last_child)`, or `None` when the node has no children.
    children: Option<(NodeId, NodeId)>,
    /// Optional payload attached to this node.
    data: Option<Payload>,
}
/// A tree (or forest) of nodes, all stored in one slot map keyed by `NodeId`.
struct QueryTree<Payload> {
    /// Storage for every node of the tree.
    nodes: SlotMap<NodeId, Node<Payload>>,
}
impl<Payload> QueryTree<Payload> {
fn new() -> Self {
QueryTree {
nodes: SlotMap::with_key(),
}
}
/// Adds a node holding `key` and a single `value`, and returns its id.
///
/// With `parent` set, the node is appended to the end of that parent's
/// child list. Panics if `parent` is not a node of this tree.
fn add_node<S>(&mut self, key: &Rc<String>, value: S, parent: Option<NodeId>) -> NodeId
where
    S: Into<NodeValueTypes>,
{
    let node_id = self.nodes.insert_with_key(|_| Node {
        key: Rc::clone(key),
        value: NodeValue::Single(value.into()),
        parent,
        prev_sibling: None,
        next_sibling: None,
        children: None,
        data: None,
    });

    // A node without a parent needs no linking.
    let parent_id = match parent {
        Some(id) => id,
        None => return node_id,
    };

    match self.nodes[parent_id].children {
        Some((first_child_id, last_child_id)) => {
            // Append after the current last child and make the new node the last child.
            self.nodes[last_child_id].next_sibling = Some(node_id);
            self.nodes[node_id].prev_sibling = Some(last_child_id);
            self.nodes[parent_id].children = Some((first_child_id, node_id));
        }
        None => {
            // First child: it is both the first and the last one.
            self.nodes[parent_id].children = Some((node_id, node_id));
        }
    }

    node_id
}
/// Adds one value to a node, turning a `Single` value into `Multiple`.
///
/// Does nothing if `node_id` is not in the tree.
fn add_value<S>(&mut self, node_id: NodeId, value: S)
where
    S: Into<NodeValueTypes>,
{
    let node = match self.nodes.get_mut(node_id) {
        Some(node) => node,
        None => return,
    };
    let incoming: NodeValueTypes = value.into();

    // Already a list: just append.
    if let NodeValue::Multiple(existing) = &mut node.value {
        existing.push(incoming);
        return;
    }

    // A single value: replace it with a two element list.
    if let NodeValue::Single(current) = &node.value {
        let pair = vec![current.clone(), incoming];
        node.value = NodeValue::Multiple(pair);
    }
}
/// Adds several values to a node, turning a `Single` value into `Multiple`.
///
/// Does nothing if `node_id` is not in the tree.
fn add_values<S>(&mut self, node_id: NodeId, values: Vec<S>)
where
    S: Into<NodeValueTypes>,
{
    let node = match self.nodes.get_mut(node_id) {
        Some(node) => node,
        None => return,
    };
    let incoming = values
        .into_iter()
        .map(|item| -> NodeValueTypes { item.into() });

    match &mut node.value {
        NodeValue::Multiple(existing) => existing.extend(incoming),
        NodeValue::Single(current) => {
            // Keep the existing value first, followed by the new ones.
            let merged: Vec<NodeValueTypes> =
                std::iter::once(current.clone()).chain(incoming).collect();
            node.value = NodeValue::Multiple(merged);
        }
    }
}
/// Looks up a node by id; `None` if the id is not in the tree.
fn get_node(&self, node_id: NodeId) -> Option<&Node<Payload>> {
    let found = self.nodes.get(node_id);
    found
}
// TODO: better if this returns an iterator?
/// Returns the ids of the direct children of `node_id`, first to last.
///
/// The result is empty if the node has no children or is not in the tree.
fn get_children(&self, node_id: NodeId) -> Vec<NodeId> {
    let first_child = self
        .get_node(node_id)
        .and_then(|node| node.children)
        .map(|(first_child_id, _)| first_child_id);

    // Follow the sibling links from the first child until they run out.
    std::iter::successors(first_child, |&cid| self.nodes[cid].next_sibling).collect()
}
/// Removes a node and all of its descendants from the tree.
///
/// The node is unlinked from its parent's `(first_child, last_child)` pair
/// and from its siblings, then its whole subtree is removed.
/// Does nothing if `node_id` is not in the tree.
fn remove_node(&mut self, node_id: NodeId) {
    if let Some(node) = self.nodes.remove(node_id) {
        // Update parent's children
        if let Some(parent_id) = node.parent {
            let parent_node = self.nodes.get_mut(parent_id).unwrap();
            if let Some((first_child_id, last_child_id)) = parent_node.children {
                if first_child_id == node_id && last_child_id == node_id {
                    // Node was the only child
                    parent_node.children = None;
                } else if first_child_id == node_id {
                    // Node was the first child
                    parent_node.children = Some((node.next_sibling.unwrap(), last_child_id));
                } else if last_child_id == node_id {
                    // Node was the last child
                    parent_node.children = Some((first_child_id, node.prev_sibling.unwrap()));
                }
                // A middle child leaves the parent's pair unchanged.
            }
        }

        // Update siblings
        if let Some(prev_id) = node.prev_sibling {
            self.nodes[prev_id].next_sibling = node.next_sibling;
        }
        if let Some(next_id) = node.next_sibling {
            self.nodes[next_id].prev_sibling = node.prev_sibling;
        }

        // Remove the descendants. The node has already been taken out of the
        // slot map, so its children are found through the removed `node`
        // itself: looking them up by `node_id` finds nothing. The subtree is
        // walked directly (first child, then sibling links) without touching
        // parent links, since every parent in it is being removed as well.
        let mut pending: Vec<NodeId> = Vec::new();
        pending.extend(node.children.map(|(first_child_id, _)| first_child_id));
        while let Some(current_id) = pending.pop() {
            if let Some(removed) = self.nodes.remove(current_id) {
                pending.extend(removed.next_sibling);
                pending.extend(removed.children.map(|(first_child_id, _)| first_child_id));
            }
        }
    }
}
/// Whether the node has no parent. Panics if `node_id` is not in the tree.
fn is_root(&self, node_id: NodeId) -> bool {
    let node = &self.nodes[node_id];
    node.parent.is_none()
}
/// Returns `true` when the node has no children.
///
/// NOTE(review): this indexes `self.nodes` directly instead of using `get`,
/// so an id that is not in the tree presumably panics — confirm against the
/// container's `Index` implementation.
fn is_leaf(&self, node_id: NodeId) -> bool {
    self.nodes[node_id].children.is_none()
}
/// Stores `payload` on the node, replacing any payload it already had.
///
/// An id that is not in the tree is ignored and the payload is dropped.
fn add_payload(&mut self, node_id: NodeId, payload: Payload) {
    match self.nodes.get_mut(node_id) {
        Some(target) => target.data = Some(payload),
        None => {}
    }
}
/// Prints every tree held in `self.nodes` to stdout.
///
/// Each node without a parent is treated as a root and printed, followed by
/// its subtree, via `print_node`.
fn print_tree(&self) {
    let root_ids: Vec<NodeId> = self
        .nodes
        .iter()
        .filter(|(_, node)| node.parent.is_none())
        .map(|(id, _)| id)
        .collect();

    let root_count = root_ids.len();
    for (index, &root_id) in root_ids.iter().enumerate() {
        // The last root is flagged so `print_node` can pick its prefix accordingly.
        let is_last = index + 1 == root_count;
        self.print_node(root_id, String::new(), is_last);
    }
}
/// Recursively prints a node and its children.
///
/// - `node_id`: The current node's ID.
/// - `prefix`: The string prefix for indentation and branch lines.
/// - `is_last`: Boolean indicating if the node is the last child of its parent.
///
/// Each node is written to stdout as `<prefix><branch><key>: (<values>)`,
/// with multiple values joined by `", "`. An id that is not present in
/// `self.nodes` is skipped silently.
fn print_node(&self, node_id: NodeId, prefix: String, is_last: bool) {
    // Retrieve the current node
    let node = match self.nodes.get(node_id) {
        Some(n) => n,
        None => return, // Node not found; skip
    };
    // Determine the branch character
    // An empty prefix is the only signal used here to recognise a root.
    let branch = if prefix.is_empty() {
        "" // Root node doesn't have a branch
    } else if is_last {
        "└── " // Last child
    } else {
        "├── " // Middle child
    };
    // Print the current node's key and values
    print!("{}{}{}", prefix, branch, node.key);
    match &node.value {
        NodeValue::Single(v) => match v {
            NodeValueTypes::String(s) => println!(": ({})", s),
            NodeValueTypes::Int(i) => println!(": ({})", i),
        },
        NodeValue::Multiple(vs) => {
            let values: Vec<String> = vs
                .iter()
                .map(|v| match v {
                    NodeValueTypes::String(s) => s.to_string(),
                    NodeValueTypes::Int(i) => i.to_string(),
                })
                .collect();
            println!(": ({})", values.join(", "));
        }
    }
    // Prepare the prefix for child nodes
    // NOTE(review): with the literals as written, the children of a root that
    // is not the last root receive an empty prefix, so they take the "root"
    // branch above and print without a branch marker. Confirm the intended
    // indentation strings.
    let new_prefix = if prefix.is_empty() {
        if is_last {
            " ".to_string()
        } else {
            "".to_string()
        }
    } else {
        if is_last {
            format!("{} ", prefix)
        } else {
            format!("{}", prefix)
        }
    };
    // Retrieve and iterate through child nodes
    if let Some((_first_child_id, _last_child_id)) = node.children {
        let children = self.get_children(node_id);
        let total = children.len();
        for (i, child_id) in children.iter().enumerate() {
            let child_is_last = i == total - 1;
            self.print_node(*child_id, new_prefix.clone(), child_is_last);
        }
    }
}
}
/// Micro-benchmark: builds a four-level tree (1 root, 100 children,
/// 100 grandchildren per child, 1000 great-grandchildren per grandchild),
/// checks the resulting node count and prints the elapsed build time.
fn main() {
    use std::time::Instant;

    let mut tree: QueryTree<i16> = QueryTree::new();
    let value = "hello";
    let axis = Rc::new("foo".to_string());
    let root_id = tree.add_node(&axis, value, None);

    let started = Instant::now();
    for _ in 0..100 {
        let child_id = tree.add_node(&axis, value, Some(root_id));
        for _ in 0..100 {
            let gchild_id = tree.add_node(&axis, value, Some(child_id));
            for _ in 0..1000 {
                let _ggchild_id = tree.add_node(&axis, value, Some(gchild_id));
            }
        }
    }

    // 1 + 100 + 100 * 100 + 100 * 100 * 1000 nodes in total.
    assert_eq!(tree.nodes.len(), 10_010_101);
    let elapsed = started.elapsed();
    println!("Elapsed: {:.2?}", elapsed);
    // tree.print_tree();
}

View File

@ -1,76 +0,0 @@
use rsfdb::listiterator::KeyValueLevel;
use rsfdb::request::Request;
use rsfdb::FDB;
use serde_json::{json, Value};
use std::time::Instant;
use std::collections::HashMap;
pub mod tree;
use std::sync::Arc;
use std::sync::Mutex;
use tree::TreeNode;
// Python-visible wrapper around an `FDB` handle. Declared `unsendable`, so
// pyo3 keeps instances on the thread that created them.
#[pyclass(unsendable)]
pub struct PyFDB {
    // The wrapped FDB handle used by `traverse_fdb`.
    pub fdb: FDB,
}
#[pymethods]
impl PyFDB {
    // Constructor exposed to Python. `fdb_config` is forwarded to `FDB::new`;
    // a failure there surfaces as a Python `RuntimeError`.
    #[new]
    #[pyo3(signature = (fdb_config=None))]
    pub fn new(fdb_config: Option<&str>) -> PyResult<Self> {
        match FDB::new(fdb_config) {
            Ok(fdb) => Ok(PyFDB { fdb }),
            Err(e) => Err(PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(
                e.to_string(),
            )),
        }
    }

    /// Traverse the FDB with the given request.
    pub fn traverse_fdb(
        &self,
        py: Python<'_>,
        request: HashMap<String, Vec<String>>,
    ) -> PyResult<PyObject> {
        let start_time = Instant::now();

        // A request that cannot be built is reported as a Python `ValueError`.
        let list_request = Request::from_json(json!(request))
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))?;

        // Listing is done with keys enabled and duplicates included.
        let listing = self
            .fdb
            .list(&list_request, true, true)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(e.to_string()))?;

        let root_entry = KeyValueLevel {
            key: "root".to_string(),
            value: "root".to_string(),
            level: 0,
        };
        let mut root = TreeNode::new(root_entry);

        for item in listing {
            // Let Ctrl-C and other Python signals interrupt long listings.
            py.check_signals()?;
            if let Some(request) = &item.request {
                root.insert(&request);
            }
        }

        println!("Total runtime: {:?}", start_time.elapsed());
        root.to_py_dict(py)
    }
}
use pyo3::prelude::*;
// Python module initialiser: registers `PyFDB` on the module.
#[pymodule]
fn rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<PyFDB>()?;
    Ok(())
}

View File

@ -0,0 +1,139 @@
use std::io;
use std::io::ErrorKind;
use std::io::SeekFrom;
use super::request::Request;
use super::FDB;
use super::FDBLIB;
/// Opaque FFI type standing in for the C library's data reader.
/// It is only ever handled through raw pointers and has no Rust-visible fields.
#[repr(C)]
pub struct FdbDataReader {
    _empty: [u8; 0],
}
/// Owns a C data reader produced by `fdb_retrieve` and exposes it through
/// `std::io::Read` and `std::io::Seek`.
pub struct DataRetriever {
    // Raw reader pointer; released with `fdb_delete_datareader` on drop.
    datareader: *mut FdbDataReader,
    // Whether `fdb_datareader_open` has succeeded and `close` is still pending.
    opened: bool,
}
impl DataRetriever {
    /// Creates a data reader and starts a retrieval for `request` on `fdb`.
    ///
    /// The reader is not opened here; `open` is called lazily by `tell` and `read`.
    /// If the retrieval cannot be started the freshly created reader is deleted
    /// before the error is returned.
    pub fn new(fdb: &FDB, request: &Request) -> Result<Self, String> {
        let mut datareader: *mut FdbDataReader = std::ptr::null_mut();
        let created = unsafe { (FDBLIB.fdb_new_datareader)(&mut datareader) };
        if created != 0 {
            return Err("Failed to create data reader".into());
        }

        let retrieved =
            unsafe { (FDBLIB.fdb_retrieve)(fdb.handle, request.as_ptr(), datareader) };
        if retrieved != 0 {
            unsafe { (FDBLIB.fdb_delete_datareader)(datareader) };
            return Err("Failed to initiate data retrieval".into());
        }

        Ok(Self {
            datareader,
            opened: false,
        })
    }

    /// Opens the reader if it is not open yet; a no-op otherwise.
    pub fn open(&mut self) -> Result<(), io::Error> {
        if self.opened {
            return Ok(());
        }
        // The size out-parameter is not requested (null pointer).
        let status =
            unsafe { (FDBLIB.fdb_datareader_open)(self.datareader, std::ptr::null_mut()) };
        if status != 0 {
            return Err(io::Error::new(
                ErrorKind::Other,
                "Failed to open data reader",
            ));
        }
        self.opened = true;
        Ok(())
    }

    /// Closes the reader if it is open. The C return code is ignored.
    pub fn close(&mut self) {
        if !self.opened {
            return;
        }
        unsafe { (FDBLIB.fdb_datareader_close)(self.datareader) };
        self.opened = false;
    }

    /// Returns the current position reported by the reader, opening it first if needed.
    pub fn tell(&mut self) -> Result<libc::c_long, io::Error> {
        self.open()?;
        let mut pos = 0;
        let status = unsafe { (FDBLIB.fdb_datareader_tell)(self.datareader, &mut pos) };
        match status {
            0 => Ok(pos),
            _ => Err(io::Error::new(
                ErrorKind::Other,
                "Failed to tell in data reader",
            )),
        }
    }
}
impl std::io::Seek for DataRetriever {
    /// Moves the reader to an absolute position derived from `pos`.
    ///
    /// - `SeekFrom::Start(n)` seeks to `n`.
    /// - `SeekFrom::Current(d)` seeks to `tell() + d`.
    /// - `SeekFrom::End(_)` is rejected with `ErrorKind::Unsupported` because
    ///   the stream length is not known.
    ///
    /// Returns the new position. A target that is negative, overflows, or does
    /// not fit in `libc::c_long` yields `ErrorKind::InvalidInput` instead of
    /// silently wrapping through `as` casts. The reader is opened first, the
    /// same way `read` and `tell` open it lazily.
    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
        fn out_of_range() -> io::Error {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "Seek position is negative or out of range",
            )
        }

        self.open()?;

        let target: i64 = match pos {
            SeekFrom::Start(offset) => {
                if offset > i64::MAX as u64 {
                    return Err(out_of_range());
                }
                offset as i64
            }
            SeekFrom::End(_offset) => {
                // Don't know size of stream, so can't seek from end
                return Err(io::Error::new(
                    io::ErrorKind::Unsupported,
                    "Seek from end is not supported for this stream",
                ));
            }
            SeekFrom::Current(offset) => {
                let current_pos = self.tell()? as i64;
                current_pos.checked_add(offset).ok_or_else(out_of_range)?
            }
        };

        // c_long may be narrower than i64 on some targets.
        if target < 0 || target > libc::c_long::MAX as i64 {
            return Err(out_of_range());
        }
        let new_pos = target as libc::c_long;

        let result = unsafe { (FDBLIB.fdb_datareader_seek)(self.datareader, new_pos) };
        if result != 0 {
            Err(io::Error::new(
                io::ErrorKind::Other,
                "Failed to seek in data reader",
            ))
        } else {
            Ok(target as u64)
        }
    }
}
impl std::io::Read for DataRetriever {
    /// Reads up to `buf.len()` bytes from the reader into `buf`, opening the
    /// reader first if needed, and returns the byte count reported by the C API.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        self.open()?;

        let destination = buf.as_mut_ptr() as *mut libc::c_void;
        let capacity = buf.len() as libc::c_long;
        let mut read = 0;
        let status = unsafe {
            (FDBLIB.fdb_datareader_read)(self.datareader, destination, capacity, &mut read)
        };

        match status {
            0 => Ok(read as usize),
            _ => Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "Failed to read from data reader",
            )),
        }
    }
}
impl Drop for DataRetriever {
    /// Closes the reader if it is still open, then releases it in the C library.
    fn drop(&mut self) {
        self.close();
        let reader = self.datareader;
        unsafe {
            (FDBLIB.fdb_delete_datareader)(reader);
        }
    }
}

55
src/rust/fdb/key.rs Normal file
View File

@ -0,0 +1,55 @@
use std::ffi::CString;
use super::CKey;
use super::FDBLIB;
/// Owning wrapper around a C key object created with `fdb_new_key`.
pub struct Key {
    // Raw key pointer; released with `fdb_delete_key` on drop.
    key: *mut CKey,
}
/// Builds a `Key` from a comma-separated list of `name => value` pairs.
///
/// Things to be aware of when using it:
/// - `Key` is named without a path, so it must be in scope at the call site.
/// - `Key::new()` is unwrapped: a failure to create the key panics.
/// - The result of each `set` call is discarded, so a pair that cannot be
///   added is silently missing from the returned key.
#[macro_export]
macro_rules! create_key {
    ($($key:expr => $values:expr),* $(,)?) => {{
        let mut key = Key::new().unwrap();
        $(
            let _ = key.set($key, &$values);
        )*
        key
    }};
}
impl Key {
    /// Creates an empty key in the C library.
    ///
    /// Returns an error string when `fdb_new_key` reports a non-zero status.
    pub fn new() -> Result<Self, String> {
        let mut raw: *mut CKey = std::ptr::null_mut();
        let status = unsafe { (FDBLIB.fdb_new_key)(&mut raw) };
        match status {
            0 => Ok(Self { key: raw }),
            _ => Err("Failed to create new key".into()),
        }
    }

    /// Adds a `key`/`value` pair to this key.
    ///
    /// Fails when either string contains an interior NUL byte (the `CString`
    /// conversion error is returned as text) or when `fdb_key_add` reports a
    /// non-zero status.
    pub fn set(&mut self, key: &str, value: &str) -> Result<(), String> {
        let name = CString::new(key).map_err(|e| e.to_string())?;
        let content = CString::new(value).map_err(|e| e.to_string())?;
        let status =
            unsafe { (FDBLIB.fdb_key_add)(self.key, name.as_ptr(), content.as_ptr()) };
        if status == 0 {
            Ok(())
        } else {
            Err("Failed to add key/value".into())
        }
    }
}
impl Drop for Key {
    /// Releases the underlying C key with `fdb_delete_key`.
    fn drop(&mut self) {
        unsafe {
            (FDBLIB.fdb_delete_key)(self.key);
        }
    }
}

View File

@ -0,0 +1,180 @@
use libc::size_t;
use std::ffi::CStr;
use std::os::raw::c_char;
use super::{FdbListIterator, FdbSplitKey};
use super::{Request, FDB, FDBLIB};
// Represents an individual key-value pair like {class : rd}, level = 0
/// One `key = value` entry of a split key, together with the `level`
/// reported for it by `fdb_splitkey_next_metadata`.
#[derive(Debug, PartialEq, Clone)]
pub struct KeyValueLevel {
    pub key: String,
    pub value: String,
    pub level: usize,
}
/// One entry produced by `ListIterator`.
#[derive(Debug, PartialEq)]
pub struct ListItem {
    // URI, offset and length as returned by `fdb_listiterator_attrs`.
    pub uri: String,
    pub offset: usize,
    pub length: usize,
    // Split-key metadata; `None` when key extraction is disabled or failed.
    pub request: Option<Vec<KeyValueLevel>>,
}
/// Owning wrapper around a C list iterator returned by `fdb_list`.
pub struct ListIterator {
    // Raw iterator pointer; released with `fdb_delete_listiterator` on drop.
    handle: *mut FdbListIterator,
    key: bool, // Whether we're extracting keys or just path, len, offset for each list item.
}
impl ListIterator {
    /// Starts a listing of `request` on `fdb`.
    ///
    /// - `key`: whether each item should also carry its split-key metadata.
    /// - `duplicates`: forwarded unchanged to `fdb_list`.
    ///
    /// Fails when `fdb_list` reports a non-zero status or hands back a null iterator.
    pub fn new(fdb: &FDB, request: &Request, key: bool, duplicates: bool) -> Result<Self, String> {
        let mut it: *mut FdbListIterator = std::ptr::null_mut();
        let result =
            unsafe { (FDBLIB.fdb_list)(fdb.handle, request.as_ptr(), &mut it, duplicates) };
        if result != 0 {
            return Err(format!("fdb_list failed with error code {}", result));
        }
        if it.is_null() {
            return Err("fdb_list returned a null iterator".into());
        }
        Ok(ListIterator { handle: it, key })
    }

    /// Extracts the keys and values of the item the iterator currently points at.
    ///
    /// Returns an error when key extraction was not enabled in `new`, when the
    /// C API reports a failure, or when a key or value is not valid UTF-8.
    ///
    /// The temporary split key is deleted on every path once it has been
    /// created. Previously each early return after `fdb_new_splitkey` leaked it.
    pub fn get_request_for_key(&self) -> Result<Vec<KeyValueLevel>, String> {
        if !self.key {
            return Err("Getting keys is not enabled for this iterator.".into());
        }

        let mut key_ptr: *mut FdbSplitKey = std::ptr::null_mut();
        let result = unsafe { (FDBLIB.fdb_new_splitkey)(&mut key_ptr) };
        if result != 0 {
            return Err(format!(
                "fdb_new_splitkey failed with error code {}",
                result
            ));
        }
        // Checked before the pointer is handed back to the C API.
        if key_ptr.is_null() {
            return Err("fdb_new_splitkey returned a null key".into());
        }

        let metadata = self.read_splitkey_metadata(key_ptr);

        // Clean up the splitkey instance, whether or not reading succeeded.
        unsafe {
            (FDBLIB.fdb_delete_splitkey)(key_ptr);
        }
        metadata
    }

    /// Fills `key_ptr` from the current list item and drains its metadata.
    /// The caller keeps ownership of `key_ptr` and must delete it.
    fn read_splitkey_metadata(
        &self,
        key_ptr: *mut FdbSplitKey,
    ) -> Result<Vec<KeyValueLevel>, String> {
        let result = unsafe { (FDBLIB.fdb_listiterator_splitkey)(self.handle, key_ptr) };
        if result != 0 {
            return Err(format!(
                "fdb_listiterator_splitkey failed with error code {}",
                result
            ));
        }

        let mut metadata = Vec::new();
        loop {
            let mut k: *const c_char = std::ptr::null();
            let mut v: *const c_char = std::ptr::null();
            let mut level: size_t = 0;
            let meta_result =
                unsafe { (FDBLIB.fdb_splitkey_next_metadata)(key_ptr, &mut k, &mut v, &mut level) };
            if meta_result != 0 || k.is_null() || v.is_null() {
                break; // No more metadata
            }
            let key = unsafe { CStr::from_ptr(k) }
                .to_str()
                .map_err(|_| "Invalid UTF-8 in splitkey key".to_string())?
                .to_owned();
            let value = unsafe { CStr::from_ptr(v) }
                .to_str()
                .map_err(|_| "Invalid UTF-8 in splitkey value".to_string())?
                .to_owned();
            metadata.push(KeyValueLevel {
                key,
                value,
                level: level as usize,
            });
        }
        Ok(metadata)
    }
}
impl Iterator for ListIterator {
    type Item = ListItem;

    /// Advances the C iterator and converts the current entry into a `ListItem`.
    ///
    /// Iteration ends (`None`) on any non-zero status from the C API or a null
    /// URI. When key extraction is enabled but fails, the error is written to
    /// stderr and the item is still returned with `request: None`.
    fn next(&mut self) -> Option<Self::Item> {
        // Assuming non-zero indicates no more items or an error
        let advanced = unsafe { (FDBLIB.fdb_listiterator_next)(self.handle) };
        if advanced != 0 {
            return None;
        }

        let mut uri_ptr: *const c_char = std::ptr::null();
        let mut off: size_t = 0;
        let mut len: size_t = 0;
        let attrs_status = unsafe {
            (FDBLIB.fdb_listiterator_attrs)(self.handle, &mut uri_ptr, &mut off, &mut len)
        };
        if attrs_status != 0 || uri_ptr.is_null() {
            return None;
        }

        // A URI that is not valid UTF-8 is replaced by a placeholder string.
        let uri = unsafe { CStr::from_ptr(uri_ptr) }
            .to_str()
            .unwrap_or("Invalid UTF-8")
            .to_owned();

        let request = if self.key {
            self.get_request_for_key()
                .map_err(|e| eprintln!("Error retrieving splitkey metadata: {}", e))
                .ok()
        } else {
            None
        };

        Some(ListItem {
            uri,
            offset: off as usize,
            length: len as usize,
            request,
        })
    }
}
impl Drop for ListIterator {
    /// Releases the C iterator, unless the handle is null.
    fn drop(&mut self) {
        if self.handle.is_null() {
            return;
        }
        unsafe {
            (FDBLIB.fdb_delete_listiterator)(self.handle);
        }
    }
}

32
src/rust/fdb/macros.rs Normal file
View File

@ -0,0 +1,32 @@
/// Generates a struct that wraps a dynamically loaded C library.
///
/// For every `fn name(args) -> ret;` line in the invocation the struct gets a
/// public `unsafe extern "C"` function-pointer field called `name`, and
/// `load` resolves each of them by symbol name from the given library.
///
/// `Arc` is referred to by its full path so the expansion compiles without
/// the call site having to import `std::sync::Arc`.
#[macro_export]
macro_rules! generate_library_wrapper {
    (
        $lib_name:ident {
            $(
                fn $func_name:ident($($arg_name:ident : $arg_type:ty),* $(,)?) $(-> $ret_type:ty)?;
            )*
        }
    ) => {
        pub struct $lib_name {
            // Never read. NOTE(review): presumably held so the library stays
            // loaded while the copied function pointers are in use — confirm.
            #[allow(dead_code)]
            lib: ::std::sync::Arc<libloading::Library>,
            $(
                pub $func_name: unsafe extern "C" fn($($arg_type),*) $(-> $ret_type)?,
            )*
        }
        impl $lib_name {
            /// Resolves every declared symbol from `lib`.
            /// Fails with the loader's error if any symbol is missing.
            pub fn load(lib: libloading::Library) -> Result<Self, Box<dyn std::error::Error>> {
                let arc_lib = ::std::sync::Arc::new(lib);
                Ok(Self {
                    $(
                        // Symbol names are looked up as NUL-terminated byte strings.
                        $func_name: unsafe {
                            *arc_lib.get::<unsafe extern "C" fn($($arg_type),*) $(-> $ret_type)?>(concat!(stringify!($func_name), "\0").as_bytes())?
                        },
                    )*
                    lib: arc_lib,
                })
            }
        }
    };
}

209
src/rust/fdb/mod.rs Normal file
View File

@ -0,0 +1,209 @@
extern crate libc;
use libc::{c_char, c_int, c_long, size_t};
use std::ffi::CString;
pub mod dataretriever;
pub mod key;
pub mod listiterator;
pub mod request;
use dataretriever::DataRetriever;
use dataretriever::FdbDataReader;
use listiterator::ListIterator;
use request::CRequest;
use request::Request;
use libloading::Library;
use once_cell::sync::Lazy;
mod macros;
use crate::generate_library_wrapper;
use std::sync::Arc;
// FDB C API functions
// Declares `FdbApiWrapper`: one function-pointer field per C symbol listed
// below, resolved by name when the wrapper is loaded. Functions written
// without a return type are declared as returning nothing.
generate_library_wrapper! {
    FdbApiWrapper {
        fn fdb_new_handle(fdb: *mut *mut FdbHandle) -> c_int;
        fn fdb_initialise() -> c_int;
        fn fdb_new_handle_from_yaml(fdb: *mut *mut FdbHandle, system_config: *const c_char, user_config: *const c_char) -> c_int;
        fn fdb_retrieve(fdb: *mut FdbHandle, req: *mut CRequest, dr: *mut FdbDataReader) -> c_int;
        fn fdb_archive_multiple(fdb: *mut FdbHandle, req: *mut CRequest, data: *const c_char, length: size_t) -> c_int;
        fn fdb_flush(fdb: *mut FdbHandle) -> c_int;
        fn fdb_delete_handle(fdb: *mut FdbHandle);
        // Data reader functions
        fn fdb_new_datareader(dr: *mut *mut FdbDataReader) -> c_int;
        fn fdb_datareader_open(dr: *mut FdbDataReader, size: *mut c_long) -> c_int;
        fn fdb_datareader_close(dr: *mut FdbDataReader) -> c_int;
        fn fdb_datareader_tell(dr: *mut FdbDataReader, pos: *mut c_long) -> c_int;
        fn fdb_datareader_seek(dr: *mut FdbDataReader, pos: c_long) -> c_int;
        // fn fdb_datareader_skip(dr: *mut FdbDataReader, count: c_long) -> c_int;
        fn fdb_datareader_read(dr: *mut FdbDataReader, buf: *mut libc::c_void, count: c_long, read: *mut c_long) -> c_int;
        fn fdb_delete_datareader(dr: *mut FdbDataReader);
        // Key functions
        fn fdb_new_key(key: *mut *mut CKey) -> c_int;
        fn fdb_key_add(key: *mut CKey, param: *const c_char, value: *const c_char) -> c_int;
        fn fdb_delete_key(key: *mut CKey);
        // Request functions
        fn fdb_new_request(request: *mut *mut CRequest) -> c_int;
        fn fdb_request_add(request: *mut CRequest, name: *const c_char, values: *const *const c_char, n_values: libc::size_t) -> c_int;
        fn fdb_delete_request(request: *mut CRequest);
        fn fdb_list(fdb: *mut FdbHandle, req: *mut CRequest, it: *mut *mut FdbListIterator, duplicates : bool) -> c_int;
        // ListIterator functions
        fn fdb_listiterator_next(it: *mut FdbListIterator) -> c_int;
        fn fdb_listiterator_attrs(
            it: *mut FdbListIterator,
            uri: *mut *const c_char,
            off: *mut size_t,
            len: *mut size_t,
        ) -> c_int;
        fn fdb_listiterator_splitkey(it: *mut FdbListIterator, key: *mut FdbSplitKey) -> c_int;
        fn fdb_delete_listiterator(it: *mut FdbListIterator);
        // SplitKey functions, extracts path, len, offset, and request = {key : value} from each key
        fn fdb_new_splitkey(key : *mut *mut FdbSplitKey) -> c_int;
        fn fdb_splitkey_next_metadata(it : *mut FdbSplitKey, key: *mut *const c_char, value: *mut *const c_char, level: *mut size_t) -> c_int;
        fn fdb_delete_splitkey(key : *mut FdbSplitKey);
    }
}
// Define the fdb library as a global, lazily-initialized library
/// Process-wide handle to the FDB5 shared library, loaded on first use.
///
/// The library location is taken from the `FDB5_LIB_PATH` environment
/// variable when it is set; otherwise the previous built-in default path is
/// used, so existing setups keep working. Loading or symbol resolution
/// failures panic on first access, with the attempted path in the message.
pub static FDBLIB: Lazy<Arc<FdbApiWrapper>> = Lazy::new(|| {
    // Fallback kept for backward compatibility; it only exists on the
    // original developer machine, so set FDB5_LIB_PATH elsewhere.
    const DEFAULT_LIBPATH: &str = "/Users/math/micromamba/envs/qubed/lib/libfdb5.dylib";

    let libpath = std::env::var_os("FDB5_LIB_PATH")
        .map(std::path::PathBuf::from)
        .unwrap_or_else(|| std::path::PathBuf::from(DEFAULT_LIBPATH));

    let raw_lib = Library::new(&libpath).unwrap_or_else(|e| {
        panic!("Failed to load library '{}': {}", libpath.display(), e)
    });
    let fdblib_wrapper = FdbApiWrapper::load(raw_lib)
        .map_err(|e| e.to_string())
        .expect("Failed to wrap FDB5 library");
    Arc::new(fdblib_wrapper)
});
// The structs below are opaque FFI placeholders: zero-sized on the Rust side
// and only ever used behind raw pointers passed to the C API.

/// Opaque split-key object (`fdb_new_splitkey` / `fdb_delete_splitkey`).
#[repr(C)]
pub struct FdbSplitKey {
    _private: [u8; 0],
}
/// Opaque list iterator (`fdb_list` / `fdb_delete_listiterator`).
#[repr(C)]
pub struct FdbListIterator {
    _private: [u8; 0],
}
/// Opaque placeholder; not referenced by any function declared in this module's wrapper.
#[repr(C)]
pub struct FdbSplitKeyMetadata {
    _private: [u8; 0],
}
/// Opaque key object (`fdb_new_key` / `fdb_delete_key`).
#[repr(C)]
pub struct CKey {
    _empty: [u8; 0],
}
/// Opaque FDB handle (`fdb_new_handle` / `fdb_delete_handle`).
#[repr(C)]
pub struct FdbHandle {
    _empty: [u8; 0],
}
/// Owning wrapper around a C FDB handle.
pub struct FDB {
    // Raw handle; released with `fdb_delete_handle` on drop.
    handle: *mut FdbHandle,
}
impl FDB {
    /// Initialises the library and creates an FDB handle.
    ///
    /// With `Some(config)` the handle is created through
    /// `fdb_new_handle_from_yaml`, passing `config` as the system
    /// configuration and an empty user configuration; with `None` the plain
    /// `fdb_new_handle` is used. Any non-zero status becomes an error string.
    pub fn new(config: Option<&str>) -> Result<Self, String> {
        let init_status = unsafe { (FDBLIB.fdb_initialise)() };
        if init_status != 0 {
            return Err("Failed to initialise FDB".into());
        }

        let mut handle: *mut FdbHandle = std::ptr::null_mut();
        let status: i32 = if let Some(cfg) = config {
            let sys_cfg =
                CString::new(cfg).map_err(|_| "System Config contains null byte".to_string())?;
            let usr_cfg =
                CString::new("").map_err(|_| "User Config contains null byte".to_string())?;
            unsafe {
                (FDBLIB.fdb_new_handle_from_yaml)(&mut handle, sys_cfg.as_ptr(), usr_cfg.as_ptr())
            }
        } else {
            unsafe { (FDBLIB.fdb_new_handle)(&mut handle) }
        };

        if status != 0 {
            return Err("Failed to create FDB handle".into());
        }
        Ok(Self { handle })
    }

    /// Archives `data` through `fdb_archive_multiple`.
    /// When `request` is `None` a null request pointer is passed to the C API.
    pub fn archive_multiple(&self, request: Option<&Request>, data: &[u8]) -> Result<(), String> {
        let req_ptr = request.map_or(std::ptr::null_mut(), |req| req.as_ptr());
        let status = unsafe {
            (FDBLIB.fdb_archive_multiple)(
                self.handle,
                req_ptr,
                data.as_ptr() as *const c_char,
                data.len(),
            )
        };
        if status == 0 {
            Ok(())
        } else {
            Err("Failed to archive data".into())
        }
    }

    /// Calls `fdb_flush` on the handle.
    pub fn flush(&self) -> Result<(), String> {
        match unsafe { (FDBLIB.fdb_flush)(self.handle) } {
            0 => Ok(()),
            _ => Err("Failed to flush FDB".into()),
        }
    }

    /// Starts a retrieval for `request`; see `DataRetriever::new`.
    pub fn retrieve(&self, request: &Request) -> Result<DataRetriever, String> {
        DataRetriever::new(self, request)
    }

    /// Starts a listing for `request`; see `ListIterator::new` for `key` and `duplicates`.
    pub fn list(
        &self,
        request: &Request,
        key: bool,
        duplicates: bool,
    ) -> Result<ListIterator, String> {
        ListIterator::new(self, request, key, duplicates)
    }
}
impl Drop for FDB {
    /// Releases the underlying C handle with `fdb_delete_handle`.
    fn drop(&mut self) {
        unsafe {
            (FDBLIB.fdb_delete_handle)(self.handle);
        }
    }
}
// // make a small test
// #[cfg(test)]
// mod tests {
// use super::*;
// #[test]
// fn test_fdb_new() {
// let fdb = FDB::new(None);
// assert!(fdb.is_ok());
// }
// }

108
src/rust/fdb/request.rs Normal file
View File

@ -0,0 +1,108 @@
use super::FDBLIB;
use libc::c_char;
use serde_json::Value;
use std::ffi::CString;
/// Opaque FFI type standing in for the C library's request object.
/// It is only ever handled through raw pointers.
#[repr(C)]
pub struct CRequest {
    _empty: [u8; 0],
}
/// Owning wrapper around a C request created with `fdb_new_request`.
pub struct Request {
    // Raw request pointer; released with `fdb_delete_request` on drop.
    request: *mut CRequest,
}
impl Request {
    /// Creates an empty request in the C library.
    pub fn new() -> Result<Self, String> {
        let mut request_ptr: *mut CRequest = std::ptr::null_mut();
        let result = unsafe { (FDBLIB.fdb_new_request)(&mut request_ptr) };
        if result != 0 {
            return Err("Failed to create new request".into());
        }
        Ok(Self {
            request: request_ptr,
        })
    }

    /// Adds `values` under `key` via `fdb_request_add`.
    ///
    /// Fails when the key or any value contains an interior NUL byte, or when
    /// the C call reports a non-zero status.
    pub fn set<'a, C, T>(&mut self, key: &'a str, values: C) -> Result<(), String>
    where
        C: AsRef<[T]>,
        T: AsRef<str> + 'a,
    {
        let values_slice = values.as_ref();
        let key_cstr = CString::new(key).map_err(|_| "Failed to create CString for key")?;

        // `cvals` owns the C strings and must outlive the pointer array below.
        let cvals: Vec<CString> = values_slice
            .iter()
            .map(|val| CString::new(val.as_ref()).map_err(|_| "Failed to create CString for value"))
            .collect::<Result<Vec<_>, _>>()?;
        let cvals_ptrs: Vec<*const c_char> = cvals.iter().map(|cstr| cstr.as_ptr()).collect();

        let result = unsafe {
            (FDBLIB.fdb_request_add)(
                self.request,
                key_cstr.as_ptr(),
                cvals_ptrs.as_ptr(),
                values_slice.len(),
            )
        };
        if result != 0 {
            return Err(format!("Failed to add values for key '{}'", key));
        }
        Ok(())
    }

    /// Returns the raw request pointer for passing to the C API.
    /// The pointer remains owned by this `Request`.
    pub fn as_ptr(&self) -> *mut CRequest {
        self.request
    }

    /// Builds a request from a JSON object whose values are strings or arrays of strings.
    ///
    /// Returns an error when the root is not an object, when a value is
    /// neither a string nor an array, or when an array contains a non-string
    /// item. Such items used to be dropped silently, producing a different
    /// request from the one the caller described.
    pub fn from_json(v: serde_json::Value) -> Result<Self, String> {
        let map = match v {
            Value::Object(map) => map,
            _ => return Err("Expected a JSON object at the root".into()),
        };

        let mut request = Self::new()?;
        for (key, value) in map {
            match value {
                Value::String(s) => {
                    // Treat single strings as a slice of length 1
                    request.set(&key, &[s])?;
                }
                Value::Array(arr) => {
                    let mut values: Vec<String> = Vec::with_capacity(arr.len());
                    for item in arr {
                        match item {
                            Value::String(s) => values.push(s),
                            _ => {
                                return Err(format!(
                                    "Unsupported non-string array item for key '{}'",
                                    key
                                ))
                            }
                        }
                    }
                    request.set(&key, &values)?;
                }
                _ => {
                    return Err(format!("Unsupported value type for key '{}'", key));
                }
            }
        }
        Ok(request)
    }
}
impl Drop for Request {
    /// Releases the underlying C request with `fdb_delete_request`.
    fn drop(&mut self) {
        unsafe {
            (FDBLIB.fdb_delete_request)(self.request);
        }
    }
}

View File

@ -1,147 +0,0 @@
use crate::{Node, NodeId, Qube};
use itertools::Itertools;
use itertools::Position;
impl Node {
/// Generate a human readable summary of the node
/// Examples include: key=value1/value2/.../valueN, key=value1/to/value1, key=*, root etc
pub fn summary(&self, qube: &Qube) -> String {
if self.is_root() {
return "root".to_string();
}
let key = &qube[self.key];
let values: String =
Itertools::intersperse(self.values.iter().map(|id| &qube[*id]), "/").collect();
format!("{}={}", key, values)
}
pub fn html_summary(&self, qube: &Qube) -> String {
if self.is_root() {
return r#"<span class="qubed-node">root</span>"#.to_string();
}
let key = &qube[self.key];
let values: String =
Itertools::intersperse(self.values.iter().map(|id| &qube[*id]), "/").collect();
let summary = format!("{}={}", key, values);
let path = summary.clone();
let info = format!("is_root: {}", self.is_root());
format!(r#"<span class="qubed-node" data-path="{path}" title="{info}">{summary}</span>"#)
}
}
/// Result of `summarise_nodes`.
struct NodeSummary {
    // Comma-separated summaries of the nodes that were walked.
    summary: String,
    // Id of the last node included in `summary`.
    end: NodeId,
}
/// Selects which per-node formatter `summarise_nodes` uses:
/// `Node::summary` for `PlainText`, `Node::html_summary` for `HTML`.
enum SummaryType {
    PlainText,
    HTML,
}
/// Given a Node, traverse the tree until a node has more than one child.
/// Returns a summary of the form "key1=v1/v2, key2=v1/v2/v3, key3=v1"
/// and the id of the last node in the summary
fn summarise_nodes(qube: &Qube, node_id: &NodeId, summary_type: SummaryType) -> NodeSummary {
let mut node_id = *node_id;
let mut summary_vec = vec![];
loop {
let node = &qube[node_id];
let summary = match summary_type {
SummaryType::PlainText => node.summary(&qube),
SummaryType::HTML => node.html_summary(&qube),
};
summary_vec.push(summary);
// Bail out if the node has anothing other than 1 child.
match node.has_exactly_one_child() {
Some(n) => node_id = n,
None => break,
};
}
NodeSummary {
summary: summary_vec.join(", "),
end: node_id,
}
}
/// Renders the subtree below `node_id` as indented text, one summarised
/// chain of nodes per line.
///
/// - `prefix`: text placed before the connector of every child line.
/// - `depth`: remaining levels to expand; at `0` the line is rendered as
///   `<summary> - ...` and its children are omitted.
fn qube_to_tree(qube: &Qube, node_id: &NodeId, prefix: &str, depth: usize) -> String {
    let NodeSummary { summary, end } = summarise_nodes(qube, node_id, SummaryType::PlainText);
    if depth == 0 {
        return format!("{} - ...\n", summary);
    }

    let mut rendered = format!("{}\n", summary);
    for (position, child_id) in qube[end].children().with_position() {
        let (connector, extension) = match position {
            Position::Last | Position::Only => ("└── ", " "),
            _ => ("├── ", ""),
        };
        let child_prefix = format!("{prefix}{extension}");
        rendered.push_str(prefix);
        rendered.push_str(connector);
        rendered.push_str(&qube_to_tree(qube, child_id, &child_prefix, depth - 1));
    }
    rendered
}
/// Renders the subtree below `node_id` as nested `<details>` elements.
///
/// - `prefix`: text placed before the connector of every child.
/// - `depth`: number of levels rendered with the `open` attribute; deeper
///   levels are still emitted but collapsed.
///
/// Fixes relative to the previous version:
/// - node summaries use the HTML formatter rather than the plain-text one;
/// - children are rendered by this function, not by `qube_to_tree`;
/// - every `<details>` element is closed;
/// - `depth` saturates at zero instead of underflowing.
fn qube_to_html(qube: &Qube, node_id: &NodeId, prefix: &str, depth: usize) -> String {
    let NodeSummary {
        summary,
        end: node_id,
    } = summarise_nodes(qube, node_id, SummaryType::HTML);
    let node = &qube[node_id];

    let open = if depth > 0 { "open" } else { "" };
    let child_depth = depth.saturating_sub(1);

    let mut output: Vec<String> = Vec::new();
    output.push(format!(
        r#"<details {open}><summary class="qubed-level">{summary}</summary>"#
    ));
    for (position, child_id) in node.children().with_position() {
        let (connector, extension) = match position {
            Position::Last | Position::Only => ("└── ", " "),
            _ => ("├── ", ""),
        };
        output.extend([
            prefix.to_string(),
            connector.to_string(),
            qube_to_html(qube, child_id, &format!("{prefix}{extension}"), child_depth),
        ]);
    }
    output.push("</details>".to_string());
    output.join("")
}
impl Qube {
    /// Return a string version of the Qube in the format
    /// root
    /// ├── class=od, expver=0001/0002, param=1/2
    /// └── class=rd, param=1/2/3
    ///
    /// Rendering starts at the root with a depth budget of 5.
    pub fn string_tree(&self) -> String {
        let root = self.root;
        qube_to_tree(self, &root, "", 5)
    }

    /// Return an HTML version of the Qube which renders like this
    /// root
    /// ├── class=od, expver=0001/0002, param=1/2
    /// └── class=rd, param=1/2/3
    /// But under the hood children are represented with a details/summary tag and each key=value is a span
    /// CSS and JS functionality is bundled inside.
    pub fn html_tree(&self) -> String {
        let root = self.root;
        qube_to_html(self, &root, "", 5)
    }
}

View File

@ -1,235 +1,109 @@
#![allow(unused_imports)]
// #![allow(dead_code)]
// #![allow(unused_variables)]
use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use pyo3::types::{PyDict, PyInt, PyList, PyString};
use python_interface::QubeError;
use std::collections::HashMap;
use std::iter;
use pyo3::prelude::*;
use std::hash::Hash;
use std::rc::Rc;
use lasso::{Rodeo, Spur};
use std::num::NonZero;
use std::ops;
mod serialisation;
mod python_interface;
mod formatters;
mod set_operations;
// This data structure uses the Newtype Index Pattern
// See https://matklad.github.io/2018/06/04/newtype-index-pattern.html
// See also https://github.com/nrc/r4cppp/blob/master/graphs/README.md#rcrefcellnode for a discussion of other approaches to trees and graphs in rust.
// https://smallcultfollowing.com/babysteps/blog/2015/04/06/modeling-graphs-in-rust-using-vector-indices/
// Index types use struct Id(NonZero<usize>)
// This reserves 0 as a special value which allows Option<Id(NonZero<usize>)> to be the same size as usize.
/// Index of a node inside a `Qube`. The wrapped value is 1-based: the
/// `Index` impls subtract one before indexing `Qube::nodes`.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub(crate) struct NodeId(NonZero<usize>);
// Allow node indices to index directly into Qubes:
impl ops::Index<NodeId> for Qube {
type Output = Node;
fn index(&self, index: NodeId) -> &Node {
&self.nodes[index.0.get() - 1]
}
#[pyfunction]
fn hello(_py: Python, name: &str) -> PyResult<String> {
Ok(format!("Hello, {}!", name))
}
impl ops::IndexMut<NodeId> for Qube {
fn index_mut(&mut self, index: NodeId) -> &mut Node {
&mut self.nodes[index.0.get() - 1]
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hello() {
let out = Python::with_gil(|py| hello(py, "world"));
assert_eq!(out.unwrap(), "Hello, world!");
}
}
// Allow interned string ids to index directly into Qubes.
impl ops::Index<StringId> for Qube {
    type Output = str;
    /// Resolves `index` through the Qube's string interner.
    fn index(&self, index: StringId) -> &str {
        &self.strings[index]
    }
}
impl NodeId {
    /// Wraps `value` as a `NodeId`; returns `None` for `0`, which is reserved.
    pub fn new(value: usize) -> Option<NodeId> {
        NonZero::new(value).map(NodeId)
    }
}
/// Newtype around a `lasso::Spur`, identifying a string interned in a `Qube`.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
struct StringId(lasso::Spur);
// Lets a `Rodeo` be indexed by `StringId` as well as by the raw `Spur`.
impl ops::Index<StringId> for lasso::Rodeo {
    type Output = str;
    fn index(&self, index: StringId) -> &str {
        &self[index.0]
    }
}
/// A single node of a `Qube`: one key with a set of values, plus links to
/// its parent and children.
#[derive(Debug, Clone)]
pub(crate) struct Node {
    pub key: StringId,
    pub metadata: HashMap<StringId, Vec<String>>,
    pub parent: Option<NodeId>, // If not present, it's the root node
    pub values: Vec<StringId>,
    // Child ids grouped by the child's key.
    pub children: HashMap<StringId, Vec<NodeId>>,
}
impl Node {
    /// Builds the root node for `q`: key `"root"`, no parent, no values, no children.
    fn new_root(q: &mut Qube) -> Node {
        let key = q.get_or_intern("root");
        Node {
            key,
            metadata: HashMap::new(),
            parent: None,
            values: Vec::new(),
            children: HashMap::new(),
        }
    }

    /// Iterates over the ids of all children, across every key group.
    fn children(&self) -> impl Iterator<Item = &NodeId> {
        self.children.values().flat_map(|group| group.iter())
    }

    /// A node is the root exactly when it has no parent.
    fn is_root(&self) -> bool {
        self.parent.is_none()
    }

    /// Because children are stored grouped by key
    /// determining the number of children quickly takes a little effort.
    /// This is a fast method for the special case of checking if a Node has exactly one child.
    /// Returns Some(NodeId) if there is exactly one child, else None
    fn has_exactly_one_child(&self) -> Option<NodeId> {
        let mut groups = self.children.values();
        match (groups.next(), groups.next()) {
            // Exactly one key group, which must itself hold exactly one id.
            (Some(group), None) => match group.as_slice() {
                [only] => Some(*only),
                _ => None,
            },
            _ => None,
        }
    }

    /// Total number of children, summed over all key groups.
    fn n_children(&self) -> usize {
        self.children.values().map(Vec::len).sum()
    }

    /// Iterates over the keys under which this node has children, resolved through `q`.
    fn keys<'a>(&'a self, q: &'a Qube) -> impl Iterator<Item = &'a str> {
        self.children.keys().map(move |id| &q[*id])
    }
}
// Arena-backed tree: nodes live in `nodes` and refer to each other by
// `NodeId`; keys and values are interned in `strings`.
#[derive(Debug, Clone)]
#[pyclass(subclass, dict)]
pub struct Qube {
    pub root: NodeId,
    nodes: Vec<Node>,
    strings: Rodeo,
}
impl Qube {
pub fn new() -> Self {
let mut q = Self {
root: NodeId::new(1).unwrap(),
nodes: Vec::new(),
strings: Rodeo::default(),
};
let root = Node::new_root(&mut q);
q.nodes.push(root);
q
}
fn get_or_intern(&mut self, val: &str) -> StringId {
StringId(self.strings.get_or_intern(val))
}
pub(crate) fn add_node(&mut self, parent: NodeId, key: &str, values: impl IntoIterator<Item = impl AsRef<str>>) -> NodeId {
let key_id = self.get_or_intern(key);
let values = values.into_iter().map(|val| self.get_or_intern(val.as_ref())).collect();
// Create the node object
let node = Node {
key: key_id,
metadata: HashMap::new(),
values: values,
parent: Some(parent),
children: HashMap::new(),
};
// Insert it into the Qube arena and determine its id
self.nodes.push(node);
let node_id = NodeId::new(self.nodes.len()).unwrap();
// Add a reference to this node's id to the parents list of children.
let parent_node = &mut self[parent];
let key_group = parent_node.children.entry(key_id).or_insert(Vec::new());
key_group.push(node_id);
node_id
}
fn print(&self, node_id: Option<NodeId>) -> String {
let node_id: NodeId = node_id.unwrap_or(self.root);
let node = &self[node_id];
node.summary(&self)
}
fn get_node_ref(&self, id: NodeId) -> NodeRef {
let node = &self[id];
NodeRef { id: id, node: &node, qube: &self }
}
pub fn get_string_id(&self, s: &str) -> Option<StringId> {
self.strings.get(s)
.map(|id| StringId(id))
}
}
#[pymodule]
fn rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<Qube>()?;
m.add("QubeError", py.get_type::<python_interface::QubeError>())?;
fn rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(hello, m)?).unwrap();
Ok(())
}
/// A borrowed view of one node: its id, the node itself, and the `Qube`
/// needed to resolve its interned keys and child ids.
pub struct NodeRef<'a> {
    pub id: NodeId,
    pub node: &'a Node,
    pub qube: &'a Qube,
}
mod fdb;
use fdb::listiterator::KeyValueLevel;
use fdb::request::Request;
use fdb::FDB;
impl<'a> NodeRef<'a> {
pub fn keys(&self) -> impl Iterator<Item = &str> {
self.node.keys(self.qube)
}
fn flat_children(&'a self) -> impl Iterator<Item = Self> {
self.node.children
.values()
.flatten()
.map(|id| {
NodeRef { id: *id, node: &self.qube[*id], qube: self.qube }
})
}
fn children_by_key(&'a self, key: &str) -> impl Iterator<Item = Self> {
let id = self.qube.get_string_id(key);
let children = id
.map(|i| self.node.children.get(&i))
.flatten();
children.map(
|ids| ids.into_iter().map(
|id| {
NodeRef { id: *id, node: &self.qube[*id], qube: self.qube }
})).into_iter().flatten()
}
// use serde_json::{json, Value};
// use std::time::Instant;
}
// use std::collections::HashMap;
// pub mod tree;
// use std::sync::Arc;
// use std::sync::Mutex;
// use tree::TreeNode;
// #[pyclass(unsendable)]
// pub struct PyFDB {
// pub fdb: FDB,
// }
// #[pymethods]
// impl PyFDB {
// #[new]
// #[pyo3(signature = (fdb_config=None))]
// pub fn new(fdb_config: Option<&str>) -> PyResult<Self> {
// let fdb = FDB::new(fdb_config)
// .map_err(|e| PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(e.to_string()))?;
// Ok(PyFDB { fdb })
// }
// /// Traverse the FDB with the given request.
// pub fn traverse_fdb(
// &self,
// py: Python<'_>,
// request: HashMap<String, Vec<String>>,
// ) -> PyResult<PyObject> {
// let start_time = Instant::now();
// let list_request = Request::from_json(json!(request))
// .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string()))?;
// // Use `fdb_guard` instead of `self.fdb`
// let list = self
// .fdb
// .list(&list_request, true, true)
// .map_err(|e| PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(e.to_string()))?;
// let mut root = TreeNode::new(KeyValueLevel {
// key: "root".to_string(),
// value: "root".to_string(),
// level: 0,
// });
// for item in list {
// py.check_signals()?;
// if let Some(request) = &item.request {
// root.insert(&request);
// }
// }
// let duration = start_time.elapsed();
// println!("Total runtime: {:?}", duration);
// let py_dict = root.to_py_dict(py)?;
// Ok(py_dict)
// }
// }
// use pyo3::prelude::*;
// #[pymodule]
// fn rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
// m.add_class::<PyFDB>()?;
// Ok(())
// }

View File

@ -1,179 +0,0 @@
use crate::{Node, NodeId, Qube, NodeRef};
use pyo3::prelude::*;
use pyo3::types::{PyList, PyType};
use core::borrow;
use std::ops::Deref;
use std::cell::Ref;
use crate::set_operations;
use crate::serialisation;
use itertools::Itertools;
use pyo3::create_exception;
create_exception!(qubed, QubeError, pyo3::exceptions::PyException);
/// A reference to a particular node in a Qube
#[pyclass]
pub struct PyNodeRef {
    // Id of the referenced node; used to index into `qube`.
    id: NodeId,
    // Owning handle to the Python-side Qube object that contains the node.
    qube: Py<Qube>, // see https://pyo3.rs/v0.23.1/types for a discussion of Py<T> and Bound<'py, T>
}
/// Build a Python-facing `PyNodeRef` from a borrowed `NodeRef`.
///
/// Only the node id is taken from `node_ref`; `qube` supplies the owning
/// Python handle stored alongside it.
fn into_py_node_ref(node_ref: NodeRef, qube: Py<Qube>) -> PyNodeRef {
    let id = node_ref.id;
    PyNodeRef { id, qube }
}
#[pymethods]
impl PyNodeRef {
    /// Recursive representation `Node(key, child, child, ...)` covering this
    /// node and all of its descendants.
    fn __repr__(&self, py: Python) -> PyResult<String> {
        fn render(node_id: NodeId, qube: &Bound<'_, Qube>) -> String {
            // A single shared borrow serves both the node and key lookups.
            let guard = qube.borrow();
            let node = &guard[node_id];
            let key = &guard[node.key];
            let mut rendered: Vec<String> = Vec::new();
            for child_id in node.children.values().flatten() {
                rendered.push(render(*child_id, qube));
            }
            format!("Node({}, {})", key, rendered.join(", "))
        }

        // Bind the Py<Qube> handle to the GIL before walking the tree.
        Ok(render(self.id, self.qube.bind(py)))
    }

    /// Short representation `Node(key)` of this node only.
    fn __str__(&self, py: Python) -> String {
        let qube = self.qube.bind(py).borrow();
        let key = &qube.strings[qube[self.id].key];
        format!("Node({})", key)
    }

    /// Direct children of this node, each wrapped in a new `PyNodeRef`
    /// sharing the same qube handle.
    #[getter]
    pub fn get_children(&self, py: Python) -> Vec<Self> {
        let qube = self.qube.bind(py).borrow();
        let mut children = Vec::new();
        for child_id in qube[self.id].children.values().flatten() {
            children.push(Self {
                id: *child_id,
                qube: self.qube.clone_ref(py),
            });
        }
        children
    }
}
/// Argument type that accepts either a single `T` or a list of `T` from Python.
// The variant order is kept as-is because the derived extraction may depend on it.
#[derive(FromPyObject)]
pub enum OneOrMany<T> {
    // A single value.
    One(T),
    // A list of values.
    Many(Vec<T>),
}
// Todo: Is there a way to rewrite this so that is doesn't allocate?
// Perhaps by returning an iterator?
/// Flatten a `OneOrMany<T>` into a `Vec<T>`.
///
/// Implemented as `From` rather than `Into`: the standard library's blanket
/// impl then provides `Into<Vec<T>> for OneOrMany<T>`, so existing
/// `values.into()` call sites keep working.
impl<T> From<OneOrMany<T>> for Vec<T> {
    fn from(value: OneOrMany<T>) -> Self {
        match value {
            // A single value becomes a one-element vector.
            OneOrMany::One(v) => vec![v],
            // A list is passed through unchanged.
            OneOrMany::Many(vs) => vs,
        }
    }
}
#[pymethods]
impl Qube {
    /// Python constructor: builds a qube with `Qube::new()`.
    #[new]
    pub fn py_new() -> Self {
        Qube::new()
    }

    /// Add a node with the given `key` and `values` under `parent`.
    ///
    /// `values` may be a single string or a list of strings. Returns a
    /// reference to the new node, or raises `QubeError` when `parent` belongs
    /// to a different qube.
    #[pyo3(name = "add_node")]
    pub fn py_add_node(
        slf: Bound<'_, Self>,
        parent: PyRef<'_, PyNodeRef>,
        key: &str,
        values: OneOrMany<String>,
    ) -> PyResult<PyNodeRef> {
        // Check that the given parent is actually in this qube and not another one
        if !parent.qube.bind(slf.py()).is(&slf) {
            return Err(QubeError::new_err("Supplied parent node is not in the target qube."));
        }

        // massage values from T | Vec<T> into Vec<T>
        let values: Vec<String> = values.into();
        let mut q = slf.borrow_mut();
        let node_id = q.add_node(parent.id, key, &values);
        Ok(PyNodeRef { id: node_id, qube: slf.into() })
    }

    /// Make `node` the root of this qube.
    ///
    /// Raises `QubeError` when `node` belongs to a different qube. This is the
    /// same ownership check `add_node` performs; without it a foreign node id
    /// would be stored as the root of this qube.
    pub fn set_root(slf: Bound<'_, Self>, node: PyRef<'_, PyNodeRef>) -> PyResult<()> {
        if !node.qube.bind(slf.py()).is(&slf) {
            return Err(QubeError::new_err("Supplied root node is not in the target qube."));
        }
        let mut q = slf.borrow_mut();
        q.root = node.id;
        Ok(())
    }

    /// Reference to the current root node.
    #[getter]
    fn get_root(slf: Bound<'_, Self>) -> PyResult<PyNodeRef> {
        Ok(PyNodeRef {
            id: slf.borrow().root,
            qube: slf.unbind(),
        })
    }

    /// Flat listing of every node: id, key, values and child ids.
    fn __repr__(&self) -> String {
        // format!("{:?}", self)
        let nodes_str: String = self.nodes.iter()
            .enumerate()
            .map(|(id, node)| {
                format!("{{id: {}, key: {}, values: [{}], children: [{}]}}",
                    // Printed ids are the enumeration index plus one.
                    id+1,
                    &self[node.key],
                    node.values.iter().map(|s| &self[*s]).join(", "),
                    node.children().map(|n| n.0).join(", "),
                )
            }).join(", ");
        format!("Qube {{root: {}, nodes: {}}}", self.root.0, nodes_str)
    }

    /// Text tree rendering, delegated to `string_tree`.
    fn __str__(&self) -> String {
        self.string_tree()
    }

    /// HTML rendering, delegated to `html_tree`.
    fn _repr_html_(&self) -> String {
        self.html_tree()
    }

    /// Exposed to Python as `print`; returns the result of `self.print(None)`.
    #[pyo3(name = "print")]
    fn py_print(&self) -> String {
        self.print(Option::None)
    }

    /// Children of the root node.
    #[getter]
    pub fn get_children(slf: Bound<'_, Self>, py: Python) -> PyResult<Vec<PyNodeRef>> {
        let root = PyNodeRef {
            id: slf.borrow().root,
            qube: slf.unbind(),
        };
        Ok(root.get_children(py))
    }

    /// Build a qube from a JSON string via `serialisation::from_json`.
    #[staticmethod]
    pub fn from_json(data: &str) -> Result<Self, serialisation::JSONError> {
        serialisation::from_json(data)
    }

    /// `self | other`: union of two qubes, delegated to `set_operations::set_operation`.
    pub fn __or__(slf: Bound<'_, Self>, other: Bound<'_, Qube>) -> Qube {
        set_operations::set_operation(&slf.borrow(), &other.borrow(), set_operations::Op::Union)
    }
}

View File

@ -1,80 +0,0 @@
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;
use crate::{Node, NodeId, Qube};
// Newtype around serde_json::Error. It exists so that a serde_json::Error can
// be converted to a PyErr automatically, going through this wrapper as the
// intermediate type.
// see https://pyo3.rs/main/function/error-handling.html#foreign-rust-error-types
pub struct JSONError(serde_json::Error);

/// Surface a JSON error to Python as a `ValueError` carrying the serde message.
impl From<JSONError> for PyErr {
    fn from(error: JSONError) -> Self {
        let JSONError(inner) = error;
        PyValueError::new_err(inner.to_string())
    }
}

/// Wrap a raw serde_json error so that `?` can be used in functions returning `JSONError`.
impl From<serde_json::Error> for JSONError {
    fn from(other: serde_json::Error) -> Self {
        JSONError(other)
    }
}
/// Range-valued node values, internally tagged by a `dtype` field.
// NOTE(review): unlike `Enum`, this has no `rename_all = "lowercase"`, so the
// tag value is "Int64" rather than "int64" — confirm against the JSON producer.
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "dtype")]
enum Ranges {
    // List of (i64, i64) pairs.
    Int64{values: Vec<(i64, i64)>}
}
/// Enumerated node values, internally tagged by a lowercase `dtype` field.
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "dtype", rename_all = "lowercase")]
enum Enum {
    // String values; serialised with `dtype` set to "str".
    Str{values: Vec<String>}
}
/// The values attached to a JSON node, internally tagged by a lowercase `type` field.
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "type", rename_all = "lowercase")]
enum Values {
    // `type` = "wildcard"; carries no data.
    Wildcard{},
    // `type` = "enum"; payload is further tagged by `dtype` (see `Enum`).
    Enum(Enum),
    // `type` = "range"; payload is further tagged by `dtype` (see `Ranges`).
    Range(Ranges)
}
/// One node of the JSON tree representation, with its children nested recursively.
#[derive(Serialize, Deserialize, Debug)]
struct JSONQube {
    // Key of this node.
    key: String,
    // Values of this node (wildcard, enum or range).
    values: Values,
    // String-to-string metadata attached to the node.
    metadata: HashMap<String, String>,
    // Child nodes.
    children: Vec<JSONQube>,
}
/// Recursively insert `nodes`, and all of their descendants, under `parent`.
///
/// Wildcard values are stored as the single value "*". Range values are not
/// implemented and hit `todo!()`. Returns the ids of the nodes created
/// directly under `parent`, in input order.
fn add_nodes(qube: &mut Qube, parent: NodeId, nodes: &[JSONQube]) -> Vec<NodeId> {
    let mut created = Vec::with_capacity(nodes.len());
    for json_node in nodes {
        let values: Vec<&str> = match &json_node.values {
            Values::Wildcard {} => vec!["*"],
            Values::Enum(Enum::Str { values }) => values.iter().map(|s| s.as_str()).collect(),
            Values::Range(_) => todo!(),
        };
        let node_id = qube.add_node(parent, &json_node.key, &values);
        // Descend before moving on to the next sibling.
        add_nodes(qube, node_id, &json_node.children);
        created.push(node_id);
    }
    created
}
/// Build a `Qube` from its JSON text representation.
///
/// Only the `children` of the top-level JSON object are inserted, under the
/// root of a fresh qube; the top-level object's own key, values and metadata
/// are not used.
///
/// Returns `Err(JSONError)` when `data` is not valid JSON for the expected
/// schema, instead of panicking.
pub fn from_json(data: &str) -> Result<Qube, JSONError> {
    // Parse the text into the typed JSONQube tree. `?` converts the
    // serde_json::Error via `From<serde_json::Error> for JSONError`.
    let json_qube: JSONQube = serde_json::from_str(data)?;
    let mut qube = Qube::new();
    let root = qube.root;
    add_nodes(&mut qube, root, &json_qube.children);
    Ok(qube)
}

View File

@ -1,2 +0,0 @@
mod json;
pub use json::{from_json, JSONError};

View File

@ -1,40 +0,0 @@
use crate::NodeRef;
use crate::{Node, NodeId, Qube};
use itertools::chain;
use std::collections::HashSet;
/// The set operations that can be requested between two qubes.
pub enum Op {
    Union,
    Intersection,
    Difference,
    SymmetricDifference,
}
fn op_to_venn_diagram(op: Op) -> (bool, bool, bool) {
use Op::*;
match op {
Union => (true, true, true),
Intersection => (false, true, false),
Difference => (true, false, false),
SymmetricDifference => (true, false, true),
}
}
/// Combine qubes `a` and `b` according to `op` and return the result as a new qube.
///
/// Not implemented yet: calling this currently panics via `todo!()`.
pub fn set_operation<'a>(a: &'a Qube, b: &'a Qube, op: Op) -> Qube {
    todo!()
    // NOTE(review): the sketched call below passes `a.root_ref()` twice; the
    // second argument presumably should be `b.root_ref()` — confirm when implementing.
    // _set_operation(a.root_ref(), a.root_ref(), op)
}
// fn _set_operation<'a>(a: NodeRef, b: NodeRef, op: Op) -> Qube {
// let keys: HashSet<&str> = HashSet::from_iter(chain(a.keys(), b.keys()));
// for key in keys {
// let a = a.children_by_key(key)
// }
// todo!()
// }
/// In-place variant of `set_operation`.
///
/// Currently a placeholder: `b` and `op` are ignored and `a` is returned unmodified.
pub fn set_operation_inplace<'a>(a: &'a mut Qube, b: &'a Qube, op: Op) -> &'a Qube {
    a
}

82
src/rust/tree.rs Normal file
View File

@ -0,0 +1,82 @@
// use pyo3::prelude::*;
// use pyo3::types::PyDict;
// use rsfdb::listiterator::KeyValueLevel;
// use serde_json::Value;
// #[derive(Debug)]
// pub struct TreeNode {
// pub key: KeyValueLevel,
// pub children: Vec<TreeNode>,
// }
// impl TreeNode {
// pub fn new(key: KeyValueLevel) -> Self {
// TreeNode {
// key,
// children: Vec::new(),
// }
// }
// pub fn insert(&mut self, path: &[KeyValueLevel]) {
// if path.is_empty() {
// return;
// }
// let kvl = &path[0];
// // Check if a child with the same key and value exists
// if let Some(child) = self.children.iter_mut().find(|child| child.key == *kvl) {
// // Insert the remaining path into the existing child
// child.insert(&path[1..]);
// } else {
// // Create a new child node
// let mut new_child = TreeNode::new(kvl.clone());
// new_child.insert(&path[1..]);
// self.children.push(new_child);
// }
// }
// pub fn traverse<F>(&self, level: usize, callback: &F)
// where
// F: Fn(&TreeNode, usize),
// {
// callback(self, level);
// for child in &self.children {
// child.traverse(level + 1, callback);
// }
// }
// pub fn to_json(&self) -> Value {
// let formatted_key = format!("{}={}", self.key.key, self.key.value);
// let children_json: Value = if self.children.is_empty() {
// Value::Object(serde_json::Map::new())
// } else {
// Value::Object(
// self.children
// .iter()
// .map(|child| {
// (
// format!("{}={}", child.key.key, child.key.value),
// child.to_json(),
// )
// })
// .collect(),
// )
// };
// // Combine the formatted key with children
// serde_json::json!({ formatted_key: children_json })
// }
// pub fn to_py_dict(&self, py: Python) -> PyResult<PyObject> {
// let py_dict = PyDict::new(py);
// for child in &self.children {
// let child_key = format!("{}={}", child.key.key, child.key.value);
// py_dict.set_item(child_key, child.to_py_dict(py)?)?;
// }
// Ok(py_dict.to_object(py))
// }
// }

View File

@ -1,21 +1,17 @@
import json
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict
import requests
import redis
import yaml
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from frozendict import frozendict
from qubed import Qube
from qubed.tree_formatters import node_tree_to_html
from fastapi.responses import FileResponse
from tree_traverser import CompressedTree
app = FastAPI()
security = HTTPBearer()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@ -24,97 +20,39 @@ app.add_middleware(
allow_headers=["*"],
)
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")
qubes: dict[str, Qube] = {}
# print("Getting climate and extremes dt data from github")
# try:
# qubes["climate-dt"] = Qube.from_json(
# requests.get(
# "https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json",
# timeout=3).json()
# )
# qubes["extremes-dt"] = Qube.from_json(
# requests.get(
# "https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/extremes_dt.json",
# timeout=3).json()
# )
# mars_language = yaml.safe_load(
# requests.get(
# "https://github.com/ecmwf/qubed/raw/refs/heads/main/config/climate-dt/language.yaml",
# timeout=3).content
# )
# except:
qubes["climate-dt"] = Qube.empty()
qubes["extremes-dt"] = Qube.empty()
mars_language = {}
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
    """Serve the site icon from the local ``favicon.ico`` file."""
    return FileResponse("favicon.ico")
if "LOCAL_CACHE" in os.environ:
print("Getting climate and extremes dt data from local files")
with open("../tests/example_qubes/climate_dt.json") as f:
qubes["climate-dt"] = Qube.from_json(json.load(f))
print("Getting data from local file")
with open("../tests/example_qubes/extremes_dt.json") as f:
qubes["climate-dt"] = qubes["climate-dt"] | Qube.from_json(json.load(f))
base = Path(os.environ["LOCAL_CACHE"])
with open(base / "compressed_tree.json", "r") as f:
json_tree = f.read()
with open("../tests/example_qubes/od.json") as f:
qubes["climate-dt"] = qubes["climate-dt"] | Qube.from_json(json.load(f))
with open("../config/language/language.yaml", "r") as f:
with open(base / "language.yaml", "r") as f:
mars_language = yaml.safe_load(f)["_field"]
with open("../config/language/paramids.yaml", "r") as f:
params = yaml.safe_load(f)
else:
print("Getting climate and extremes dt data from github")
qubes["climate-dt"] = Qube.from_json(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/climate_dt.json",
timeout=1,
).json()
)
qubes["extremes-dt"] = Qube.from_json(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/extremes_dt.json",
timeout=1,
).json()
)
print("Getting cache from redis")
r = redis.Redis(host="redis", port=6379, db=0)
json_tree = r.get("compressed_catalog")
assert json_tree, "No compressed tree found in redis"
mars_language = json.loads(r.get("mars_language"))
qubes["od"] = Qube.from_json(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/tests/example_qubes/od.json",
timeout=1,
).json()
)
qubes["climate-dt"] = qubes["climate-dt"] | qubes["extremes-dt"] | qubes["od"]
mars_language = yaml.safe_load(
requests.get(
"https://github.com/ecmwf/qubed/raw/refs/heads/main/config/climate-dt/language.yaml",
timeout=3,
).content
)["_field"]
print("Loading tree from json")
c_tree = CompressedTree.from_json(json.loads(json_tree))
# Load the API key used to authorise write requests: prefer the API_KEY
# environment variable, otherwise fall back to the local secret file.
if "API_KEY" in os.environ:
    api_key = os.environ["API_KEY"]
else:
    with open("api_key.secret", "r") as f:
        # Strip surrounding whitespace: a trailing newline in the file would
        # otherwise become part of the key and no bearer token would match it.
        api_key = f.read().strip()
print("Partialy decompressing tree, shoud be able to skip this step in future.")
tree = c_tree.reconstruct_compressed_ecmwf_style()
print("Ready to serve requests!")
def validate_key(key: str):
    """Return ``key`` if a qube is registered under it, else raise a 404 HTTPException."""
    if key in qubes:
        return key
    raise HTTPException(status_code=404, detail=f"Qube {key} not found")
async def get_body_json(request: Request):
    """Dependency that returns the request body decoded by ``request.json()``."""
    body = await request.json()
    return body
def parse_request(request: Request) -> dict[str, str | list[str]]:
def request_to_dict(request: Request) -> Dict[str, Any]:
# Convert query parameters to dictionary format
request_dict = dict(request.query_params)
for key, value in request_dict.items():
@ -125,68 +63,116 @@ def parse_request(request: Request) -> dict[str, str | list[str]]:
return request_dict
def validate_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Check the bearer token against the configured API key.

    Returns the credentials unchanged when the token matches, otherwise raises
    a 403 HTTPException.
    """
    import secrets

    # Constant-time comparison, so response timing does not leak how much of
    # the key matched. Both sides are encoded to bytes because
    # compare_digest rejects non-ASCII str arguments.
    supplied = credentials.credentials.encode("utf-8")
    expected = api_key.encode("utf-8")
    if not secrets.compare_digest(supplied, expected):
        raise HTTPException(status_code=403, detail="Incorrect API Key")
    return credentials
def match_against_cache(request, tree):
    """Match a request against a nested tree keyed by ``"key=v1,v2"`` strings.

    An empty tree means the end of a branch and yields ``{"_END_": {}}``.
    For each tree entry whose key is in ``request``, the requested value
    (or the requested values that are allowed, if a list) is kept and the
    subtree is matched recursively. If nothing matches at this level, every
    key of ``tree`` is returned with an empty subtree.

    Raises ValueError when a tree key does not contain exactly one ``=``.
    """
    if not tree:
        return {"_END_": {}}

    matches = {}
    for k, subtree in tree.items():
        parts = k.split("=")
        if len(parts) != 2:
            raise ValueError(f"Key {k} is not in the correct format")
        key, raw_values = parts
        if key not in request:
            continue

        allowed = set(raw_values.split(","))
        requested = request[key]
        if isinstance(requested, list):
            # Keep the request's own ordering of the values that are allowed.
            selected = ",".join(v for v in requested if v in allowed)
            if not selected:
                continue
        elif requested in allowed:
            selected = requested
        else:
            continue

        matches[f"{key}={selected}"] = match_against_cache(request, subtree)

    return matches if matches else {k: {} for k in tree.keys()}
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
    """Serve the site icon from the local ``favicon.ico`` file."""
    return FileResponse("favicon.ico")
def max_tree_depth(tree):
    """Return the number of levels in a nested-dict tree (0 for an empty tree)."""
    if not tree:
        return 0
    deepest_child = max(map(max_tree_depth, tree.values()))
    return deepest_child + 1
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Render ``index.html`` with the dev-server config and the API base URL."""
    context = {
        "request": request,
        "config": {
            "message": "Hello from the dev server!",
        },
        # API_URL can be overridden from the environment.
        "api_url": os.environ.get("API_URL", "/api/v1/"),
    }
    return templates.TemplateResponse("index.html", context)
def prune_short_branches(tree, depth=None):
    """Keep only the branches of ``tree`` that reach the full depth.

    ``depth`` defaults to the maximum depth of ``tree``. A child is kept only
    when its own maximum depth is exactly ``depth - 1``, and the same rule is
    applied recursively to the children that are kept.
    """
    target = max_tree_depth(tree) if depth is None else depth
    child_depth = target - 1
    pruned = {}
    for k, v in tree.items():
        if max_tree_depth(v) == child_depth:
            pruned[k] = prune_short_branches(v, child_depth)
    return pruned
@app.get("/api/v1/keys/")
async def keys():
    """List the names under which qubes are registered."""
    return list(qubes)
def get_paths_to_leaves(tree):
    """Yield, for every leaf of a nested-dict tree, the list of keys from the top down to it."""
    for k, v in tree.items():
        if v:
            for tail in get_paths_to_leaves(v):
                yield [k, *tail]
        else:
            # An empty subtree marks a leaf.
            yield [k]
@app.get("/api/v1/get/{key}/")
async def get(
    key: str = Depends(validate_key),
    request: dict[str, str | list[str]] = Depends(parse_request),
):
    """Return the whole qube registered under ``key`` as JSON.

    The parsed query parameters in ``request`` are accepted but not used here.
    """
    return qubes[key].to_json()
def get_leaves(tree):
    """Yield the key of every leaf (entry with an empty subtree) in a nested-dict tree."""
    for k, v in tree.items():
        if v:
            yield from get_leaves(v)
        else:
            yield k
@app.post("/api/v1/union/{key}/")
async def union(
    key: str,
    credentials: HTTPAuthorizationCredentials = Depends(validate_api_key),
    body_json=Depends(get_body_json),
):
    """Merge the qube in the request body into the qube stored under ``key``.

    An empty qube is registered first if ``key`` is not known yet. The merged
    qube replaces the stored one and is returned as JSON.
    """
    if key not in qubes:
        qubes[key] = Qube.empty()

    incoming = Qube.from_json(body_json)
    merged = qubes[key] | incoming
    qubes[key] = merged
    return merged.to_json()
@app.get("/api/tree")
async def get_tree(request: Request):
    """Return the result of ``c_tree.multi_match`` for the request's query parameters."""
    request_dict = request_to_dict(request)
    # Run the match once and reuse the result for both the debug print and the
    # response, rather than matching twice.
    matched = c_tree.multi_match(request_dict)
    print(matched)
    return matched
def follow_query(request: dict[str, str | list[str]], qube: Qube):
s = qube.select(request, mode="next_level", consume=False)
@app.get("/api/match")
async def get_match(request: Request):
    """Match the query parameters against the cached tree and return the pruned result."""
    # Query parameters as a plain dictionary.
    request_dict = request_to_dict(request)

    # Pruning keeps only the branches that are as deep as the deepest match.
    # This means if you don't choose a certain branch at some point
    # the UI won't keep nagging you to choose a value for that branch.
    return prune_short_branches(match_against_cache(request_dict, tree))
@app.get("/api/paths")
async def api_paths(request: Request):
request_dict = request_to_dict(request)
match_tree = match_against_cache(request_dict, tree)
match_tree = prune_short_branches(match_tree)
paths = get_paths_to_leaves(match_tree)
# deduplicate leaves based on the key
by_path = defaultdict(lambda: {"paths": set(), "values": set()})
for p in paths:
if p[-1] == "_END_":
continue
key, values = p[-1].split("=")
values = values.split(",")
path = tuple(p[:-1])
for request, node in s.leaf_nodes():
if not node.metadata.get("is_leaf", True):
by_path[node.key]["values"].update(node.values.values)
by_path[node.key]["paths"].add(frozendict(request))
by_path[key]["values"].update(values)
by_path[key]["paths"].add(tuple(path))
return s, [
return [
{
"paths": list(v["paths"]),
"key": key,
@ -196,140 +182,49 @@ def follow_query(request: dict[str, str | list[str]], qube: Qube):
]
@app.get("/api/v1/select/{key}/")
async def select(
    key: str = Depends(validate_key),
    request: dict[str, str | list[str]] = Depends(parse_request),
):
    """Apply ``select(request)`` to the qube stored under ``key`` and return the result as JSON."""
    return qubes[key].select(request).to_json()
@app.get("/api/v1/query/{key}")
async def query(
    key: str = Depends(validate_key),
    request: dict[str, str | list[str]] = Depends(parse_request),
):
    """Return the second element (the paths) of ``follow_query`` for the qube stored under ``key``."""
    _, paths = follow_query(request, qubes[key])
    return paths
@app.get("/api/v1/basicstac/{key}/{filters:path}")
async def basic_stac(filters: str, key: str = Depends(validate_key)):
    """Serve a basic STAC Catalog for the qube ``key``, narrowed by path filters.

    ``filters`` is a ``/``-separated list of ``name=value`` segments; segments
    without ``=`` are ignored. The response has one child link per leaf that
    ``follow_query`` returns for those filters.
    """
    pairs = filters.strip("/").split("/")
    # Split on the first "=" only, so a value that itself contains "=" does not
    # raise while building the dict.
    request = dict(p.split("=", 1) for p in pairs if "=" in p)

    qube, _ = follow_query(request, qubes[key])

    def make_link(child_request):
        """Take a MARS Key and information about which paths matched up to this point and use it to make a STAC Link"""
        kvs = [f"{k}={v}" for k, v in child_request.items()]
        # `key` here is the qube name from the enclosing endpoint.
        href = f"/api/v1/basicstac/{key}/{'/'.join(kvs)}"
        last_key, last_value = list(child_request.items())[-1]

        return {
            "title": f"{last_key}={last_value}",
            "href": href,
            "rel": "child",
            "type": "application/json",
        }

    # Format the response as a STAC collection
    # The catalog describes the last key=value pair of the request, or a
    # synthetic root entry when no filters were given.
    this_key, this_value = list(request.items())[-1] if request else ("root", "root")

    key_info = mars_language.get(this_key, {})
    try:
        values_info = dict(key_info.get("values", {}))
        value_info = values_info.get(
            this_value, f"No info found for value `{this_value}` found."
        )
    except (TypeError, ValueError):
        # dict() raises when the language entries are not key/value pairs.
        value_info = f"No info found for value `{this_value}` found."

    if this_key == "root":
        value_info = "The root node"
    # key_desc = key_info.get(
    # "description", f"No description for `key` {this_key} found."
    # )
    print(this_key, this_value)
    print(this_key, key_info)
    stac_collection = {
        "type": "Catalog",
        "stac_version": "1.0.0",
        "id": "root"
        if not request
        else "/".join(f"{k}={v}" for k, v in request.items()),
        "title": f"{this_key}={this_value}",
        "description": value_info,
        "links": [make_link(leaf) for leaf in qube.leaves()],
        # "debug": {
        # "qube": str(qube),
        # },
    }

    return stac_collection
@app.get("/api/v1/stac/{key}/")
async def get_STAC(
key: str = Depends(validate_key),
request: dict[str, str | list[str]] = Depends(parse_request),
):
qube, paths = follow_query(request, qubes[key])
kvs = [
f"{k}={','.join(v)}" if isinstance(v, list) else f"{k}={v}"
for k, v in request.items()
]
request_params = "&".join(kvs)
@app.get("/api/stac")
async def get_STAC(request: Request):
request_dict = request_to_dict(request)
paths = await api_paths(request)
def make_link(key_name, paths, values):
"""Take a MARS Key and information about which paths matched up to this point and use it to make a STAC Link"""
href_template = f"/stac?{request_params}{'&' if request_params else ''}{key_name}={{{key_name}}}"
path = paths[0]
href_template = f"/stac?{'&'.join(path)}{'&' if path else ''}{key_name}={{}}"
optional = [False]
# optional_str = (
# "Yes"
# if all(optional) and len(optional) > 0
# else ("Sometimes" if any(optional) else "No")
# )
values_from_mars_language = mars_language.get(key_name, {}).get("values", [])
print(f"{key_name = }")
if key_name == "param":
print(params)
values_from_mars_language = params
value_descriptions = [
max(params.get(int(v), [""]), key=len) for v in values
]
print(value_descriptions)
else:
values_from_mars_language = mars_language.get(key_name, {}).get(
"values", []
)
# values = [v[0] if isinstance(v, list) else v for v in values_from_mars_language]
if all(isinstance(v, list) for v in values_from_mars_language):
value_descriptions_dict = {
k: v[-1]
for v in values_from_mars_language
if len(v) > 1
for k in v[:-1]
}
value_descriptions = [
value_descriptions_dict.get(v, "") for v in values
]
if not any(value_descriptions):
value_descriptions = None
if all(isinstance(v, list) for v in values_from_mars_language):
value_descriptions_dict = {
k: v[-1]
for v in values_from_mars_language
if len(v) > 1
for k in v[:-1]
}
value_descriptions = [value_descriptions_dict.get(v, "") for v in values]
if not any(value_descriptions):
value_descriptions = None
return {
"title": key_name,
"uriTemplate": href_template,
"generalized_datacube:href_template": href_template,
"rel": "child",
"type": "application/json",
"variables": {
key_name: {
"type": "string",
"description": mars_language.get(key_name, {}).get(
"description", ""
),
"enum": values,
"value_descriptions": value_descriptions,
# "paths": paths,
}
"generalized_datacube:dimension": {
"type": mars_language.get(key_name, {}).get("type", ""),
"description": mars_language.get(key_name, {}).get("description", ""),
"values": values,
"value_descriptions": value_descriptions,
"optional": any(optional),
"multiple": True,
"paths": paths,
},
}
@ -347,29 +242,20 @@ async def get_STAC(
"description": mars_language.get(key, {}).get("description", ""),
"value_descriptions": value_descriptions(key, values),
}
for key, values in request.items()
for key, values in request_dict.items()
}
# Format the response as a STAC collection
stac_collection = {
"type": "Catalog",
"type": "Collection",
"stac_version": "1.0.0",
"id": "root" if not request else "/stac?" + request_params,
"id": "partial-matches",
"description": "STAC collection representing potential children of this request",
"links": [make_link(p["key"], p["paths"], p["values"]) for p in paths],
"debug": {
# "request": request,
"request": request_dict,
"descriptions": descriptions,
# "paths": paths,
"qube": node_tree_to_html(
qube.compress(),
collapse=True,
depth=10,
include_css=False,
include_js=False,
max_summary_length=200,
css_id="qube",
),
"paths": paths,
},
}

View File

@ -1,5 +1,3 @@
fastapi[standard]
pe
redis
frozendict
requests

View File

@ -1,3 +1,3 @@
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
cd "$parent_path"
LOCAL_CACHE=True fastapi dev ./main.py --port 8124 --reload
LOCAL_CACHE=../config/climate-dt fastapi dev ./main.py --port 8124 --reload

View File

@ -1,3 +0,0 @@
# Resolve the directory containing this script and run the server from there.
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P )
# Abort if the directory cannot be entered, instead of starting the server
# from whatever the current directory happens to be.
cd "$parent_path" || exit 1
sudo LOCAL_CACHE=True ../../.venv/bin/fastapi dev ./main.py --port 80 --host=0.0.0.0 --reload

View File

@ -1,50 +0,0 @@
/* Styling for the collapsible qube tree rendered inside <pre id="qube">. */
pre#qube {
    /* One font stack; it already ends in the generic monospace fallback, so
       the separate `font-family: monospace` that preceded it was always
       overridden. */
    font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;
    white-space: pre;
    font-size: 12px;
    line-height: 1.4;

    details {
        margin-left: 0;
    }

    .qubed-level a {
        margin-left: 10px;
        text-decoration: none;
    }

    /* Summaries act as single-line toggles, truncated with an ellipsis. */
    summary {
        list-style: none;
        cursor: pointer;
        text-overflow: ellipsis;
        overflow: hidden;
        text-wrap: nowrap;
        display: block;
    }

    span.qubed-node:hover {
        background-color: #f0f0f0;
    }

    /* Custom open/closed indicators. */
    details > summary::after {
        content: ' ▲';
    }

    details:not([open]) > summary::after {
        content: " ▼";
    }

    .qubed-level {
        text-overflow: ellipsis;
        overflow: hidden;
        text-wrap: nowrap;
        display: block;
    }

    /* Hide the WebKit disclosure marker. */
    summary::-webkit-details-marker {
        display: none;
        content: "";
    }
}

View File

@ -7,39 +7,18 @@
- **Extension [Maturity Classification](https://github.com/radiantearth/stac-spec/tree/master/extensions/README.md#extension-maturity):** Proposal
- **Owner**: @TomHodson
This STAC extension borrows the [Draft OGC Records API](https://docs.ogc.org/DRAFTS/20-004.html), specifically the [templated links section](https://docs.ogc.org/DRAFTS/20-004.html#sc_templated_links_with_variables) to give STAC the ability to index very large datasets that conform to a generalised datacube model.
This STAC extension allows for the representation of [generalised datacubes][gen_datacubes].
A typical datacube has a fixed set of dimensions `[a, b, c..]`, each of which has a fixed span `{a: ["temp","rainfall"], b : [1-7], c:[True, False]}` such that we can access data by indexing, i.e. providing a value for each axis, `a="rainfall", b=1, ...`. A generalized datacube, by our definition, allows the dimensions to change during indexing, so choosing `a="rainfall"` might yield a different set of axes from `a="temp"`.
A datacube has a fixed set of dimensions `[a, b, c..]`, each of which has a fixed span `{a: ["temp","rainfall"], b : [1-7], c:[True, False]}` such that we can access data by indexing, i.e. providing a value for each axis, `a="rainfall", b=1, ...`. A generalised datacube allows the dimensions to change during indexing, so choosing `a="rainfall"` might yield a different set of axes from `a="temp"`.
The [STAC Datacube][datacube_extension] extension serves the needs of datacubes that appear in STAC as Items or Collections, i.e. as leaves in the tree. This extension instead focuses on allowing STAC to serve as an interface to dynamically explore the branches of generalised datacubes. It does this by adding additional metadata from the OGC Records standard to the children of Catalog entries.
The [STAC Datacube][datacube_extension] extension serves the needs of datacubes that appear in STAC as Items or Collections, i.e. as leaves in the tree. This extension instead focuses on allowing STAC to serve as an interface to dynamically explore the branches of generalised datacubes. It does this by adding additional metadata to the children of Catalog entries.
In practice, what this proposal does is:
We take the *Dimension Objects* defined by the [Datacube Extension][datacube_extension] and add them to [Link objects][link_objects] under the key `generalized_datacube:dimension`. This enables a single Link Object to represent a whole axis and its allowed values. Since `href` must now be constructed dynamically, we remove it and add a `generalized_datacube:href_template` attribute to communicate how to construct the URLs corresponding to a particular choice of value or values.
1. For child items that represent many distinct children, replace `"links":` with `"linkTemplates":` in the Catalog entry. (Following the example of the OGC Records API.)
2. For each `rel: Child` object in `linkTemplates`:
In order to support more complex slicing operations in which multiple indices may be selected for a given dimension, we also add additional optional attributes to all *Dimension Objects*. These are:
a. Add a `variables` key following the OGC Records API whose value is a dictionary with entries like
```json
"format": {
"type": "string",
"enum": [
"application/vnd.google-earth.kml+xml",
"application/vnd.google-earth.kmz",
"image/png",
"image/jpeg",
"image/gif",
"image/png; mode=8bit",
"application/x-pdf",
"image/svg+xml",
"image/tiff"
]
}
```
b. Add a "uriTemplate" key that specifies how to construct the resulting URL, i.e. `http://hostname.tld/app/index.html?class=od&format={format}`
This enables a child object to represent a whole axis and its allowed values. Since `href` must now be constructed dynamically, we remove it and add a `generalized_datacube:href_template` attribute to communicate how to construct the URLs corresponding to a particular choice of value or values.
* `optional` : bool, whether this dimension can be skipped.
* `multiple` : bool, whether multiple values can be selected for this key.
[gen_datacubes]: https://github.com/ecmwf/datacube-spec
[link_objects]: https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#link-object
@ -55,20 +34,19 @@ A typical `Catalog` entry with this extension:
"id": "rainfall",
"stac_version": "1.0.0",
"description": "ECMWF's Operational Data Archive",
"linkTemplates": [
"links": [
{
"rel": "child",
"title": "Expver - Experiment Version",
"uriTemplate": "http://hostname.tld/app/index.html?class=od&expver={expver}",
"generalized_datacube:href_template": "http://136.156.129.226/app/index.html?class=od&expver={}",
"rel": "child",
"type": "application/json",
"variables" : {
"expver" : {
"description": "Experiment version, 0001 selects operational data.",
"type" : "string",
"enum" : ["0001", "xxxx"],
"value_descriptions" : ["Operational Data", "Experimental Data"],
"optional" : false,
}
"generalized_datacube:dimension" : {
"type" : "enum",
"description": "Experiment version, 0001 selects operational data.",
"values" : ["0001", "xxxx"],
"value_descriptions" : ["Operational Data", "Experimental Data"],
"optional" : false,
"multiple": true,
}
""
@ -92,19 +70,120 @@ The fields in the table below can be used in these parts of STAC documents:
- [ ] Assets (for both Collections and Items, incl. Item Asset Definitions in Collections)
- [x] Links
| Field Name | Type | Description |
| -------------------- | ------------------------- | --------------------------------------------------------------------------------------------------------------------- |
| uriTemplate | URI Template | Of the form "http://hostname.tld/app/index.html?class=od&expver={expver}", follows OGC Records Spec for uriTemplates |
| variables | | |
| Field Name | Type | Description |
| -------------------- | ------------------------- | -------------------------------------------------------- |
| axis:dimension | Dimension Object | Whether the axis is an enum, date range, time range etc |
| axis:href_template | string | Template describing how to construct the URL for a particular choice of value or values |
### Additional Field Information
#### uriTemplate
Todo
#### axis:dimension
#### variables
Todo
### Dimension Object
The dimension object reuses all those [defined by the datacube extension](https://github.com/stac-extensions/datacube#dimension-object), currently those are Horizontal Spatial Raster, Vertical Spatial, Temporal Dimension, Spatial Vector Dimension, Additional Dimension. They are reproduced below for reference.
These dimension objects are defined in addition:
### Enum Dimension Object
| Field Name | Type | Description |
| ---------------- | ----------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** `enum`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| values | \[number\|string] | An ordered list of all values, especially useful for [nominal](https://en.wikipedia.org/wiki/Level_of_measurement#Nominal_level) values. |
| value_descriptions | \[string] | Optionally provide a human readable description for each value. Useful if the values are codes that have defined meanings. |
| step | number\|null | If the dimension consists of [interval](https://en.wikipedia.org/wiki/Level_of_measurement#Interval_scale) values, the space between the values. Use `null` for irregularly spaced steps. |
| unit | string | The unit of measurement for the data, preferably compliant to [UDUNITS-2](https://ncics.org/portfolio/other-resources/udunits2/) units (singular). |
| reference_system | string | The reference system for the data. |
An Enum Dimension Object MUST specify `values`.
Dimension objects defined by the datacube extension:
### Horizontal Spatial Raster Dimension Object
A spatial raster dimension in one of the horizontal (x or y) directions.
| Field Name | Type | Description |
| ---------------- | -------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `spatial`. |
| axis | string | **REQUIRED.** Axis of the spatial raster dimension (`x`, `y`). |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[number] | **REQUIRED.** Extent (lower and upper bounds) of the dimension as two-element array. Open intervals with `null` are not allowed. |
| values | \[number] | Optionally, an ordered list of all values. |
| step | number\|null | The space between the values. Use `null` for irregularly spaced steps. |
| reference_system | string\|number\|object | The spatial reference system for the data, specified as [numerical EPSG code](http://www.epsg-registry.org/), [WKT2 (ISO 19162) string](http://docs.opengeospatial.org/is/18-010r7/18-010r7.html) or [PROJJSON object](https://proj.org/specifications/projjson.html). Defaults to EPSG code 4326. |
### Vertical Spatial Dimension Object
A spatial dimension in vertical (z) direction.
| Field Name | Type | Description |
| ---------------- | ---------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `spatial`. |
| axis | string | **REQUIRED.** Axis of the spatial dimension, always `z`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[number\|null\] | If the dimension consists of [ordinal](https://en.wikipedia.org/wiki/Level_of_measurement#Ordinal_scale) values, the extent (lower and upper bounds) of the values as two-element array. Use `null` for open intervals. |
| values | \[number\|string\] | An ordered list of all values, especially useful for [nominal](https://en.wikipedia.org/wiki/Level_of_measurement#Nominal_level) values. |
| step | number\|null | If the dimension consists of [interval](https://en.wikipedia.org/wiki/Level_of_measurement#Interval_scale) values, the space between the values. Use `null` for irregularly spaced steps. |
| unit | string | The unit of measurement for the data, preferably compliant to [UDUNITS-2](https://ncics.org/portfolio/other-resources/udunits2/) units (singular). |
| reference_system | string\|number\|object | The spatial reference system for the data, specified as [numerical EPSG code](http://www.epsg-registry.org/), [WKT2 (ISO 19162) string](http://docs.opengeospatial.org/is/18-010r7/18-010r7.html) or [PROJJSON object](https://proj.org/specifications/projjson.html). Defaults to EPSG code 4326. |
A Vertical Spatial Dimension Object MUST specify an `extent` or `values`. It MAY specify both.
### Temporal Dimension Object
A temporal dimension based on the ISO 8601 standard. The temporal reference system for the data is expected to be ISO 8601 compliant
(Gregorian calendar / UTC). Data not compliant with ISO 8601 can be represented as an *Additional Dimension Object* with `type` set to `temporal`.
| Field Name | Type | Description |
| ---------- | --------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `temporal`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[string\|null] | **REQUIRED.** Extent (lower and upper bounds) of the dimension as two-element array. The dates and/or times must be strings compliant to [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). `null` is allowed for open date ranges. |
| values | \[string] | If the dimension consists of an ordered list of specific values they can be listed here. The dates and/or times must be strings compliant to [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). |
| step | string\|null | The space between the temporal instances as [ISO 8601 duration](https://en.wikipedia.org/wiki/ISO_8601#Durations), e.g. `P1D`. Use `null` for irregularly spaced steps. |
### Spatial Vector Dimension Object
A vector dimension that defines a spatial dimension based on geometries.
| Field Name | Type | Description |
| ---------------- | -------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Type of the dimension, always `geometry`. |
| axes | \[string] | Axes of the vector dimension as an ordered set of `x`, `y` and `z`. Defaults to `x` and `y`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| bbox | \[number] | **REQUIRED.** A single bounding box of the geometries as defined for [STAC Collections](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#spatial-extent-object), but not nested. |
| values | \[string\] | Optionally, a representation of the geometries. This could be a list of WKT strings or other identifiers. |
| geometry_types | \[[GeoJSON Types](https://www.rfc-editor.org/rfc/rfc7946#section-1.4)] | A set of geometry types. If not present, mixed geometry types must be assumed. |
| reference_system | string\|number\|object | The spatial reference system for the data, specified as [numerical EPSG code](http://www.epsg-registry.org/), [WKT2 (ISO 19162) string](http://docs.opengeospatial.org/is/18-010r7/18-010r7.html) or [PROJJSON object](https://proj.org/specifications/projjson.html). Defaults to EPSG code 4326. |
For a general explanation of what a vector datacube and a vector dimension are, please read the article "[Vector Data Cubes](https://r-spatial.org/r/2022/09/12/vdc.html)".
### Additional Dimension Object
An additional dimension that is not `spatial`, but may be `temporal` if the data is not compliant with ISO 8601 (see below).
| Field Name | Type | Description |
| ---------------- | ----------------- | ------------------------------------------------------------ |
| type | string | **REQUIRED.** Custom type of the dimension, never `spatial` or `geometry`. |
| description | string | Detailed multi-line description to explain the dimension. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| extent | \[number\|null] | If the dimension consists of [ordinal](https://en.wikipedia.org/wiki/Level_of_measurement#Ordinal_scale) values, the extent (lower and upper bounds) of the values as two-element array. Use `null` for open intervals. |
| values | \[number\|string] | An ordered list of all values, especially useful for [nominal](https://en.wikipedia.org/wiki/Level_of_measurement#Nominal_level) values. |
| step | number\|null | If the dimension consists of [interval](https://en.wikipedia.org/wiki/Level_of_measurement#Interval_scale) values, the space between the values. Use `null` for irregularly spaced steps. |
| unit | string | The unit of measurement for the data, preferably compliant to [UDUNITS-2](https://ncics.org/portfolio/other-resources/udunits2/) units (singular). |
| reference_system | string | The reference system for the data. |
An Additional Dimension Object MUST specify an `extent` or `values`. It MAY specify both.
Note on "Additional Dimension" with type `temporal`:
You can distinguish the "Temporal Dimension" from an "Additional Dimension" by checking whether the extent exists and contains strings.
So if the `type` equals `temporal` and `extent` is an array of strings/null, then you have a "Temporal Dimension",
otherwise you have an "Additional Dimension".

View File

@ -1,81 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Sequence
from qubed.rust import Qube as rsQube
# q = pyQube.from_tree("""
# root, class=d1
# ├── dataset=another-value, generation=1/2/3
# └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """)
# json_str = json.dumps(q.to_json())
# rust_qube = Qube.from_json(json_str)
# # print(repr(rust_qube))
# # print(json_str)
# expected = """root, class=d1
# ├── dataset=another-value, generation=1/2/3
# └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """
# assert repr(rust_qube) == expected
# # print(rs_qube._repr_html_())
# print(q | q)
value = str | int | float | datetime
class Qube(rsQube):
    """Subclass of the ``Qube`` imported from ``qubed.rust`` adding Python-side constructors."""

    @classmethod
    def empty(cls):
        """Return a new qube created with the no-argument constructor of ``cls``."""
        return cls()

    @classmethod
    def from_datacube(cls, datacube: dict[str, value | Sequence[value]]) -> Qube:
        """Build a qube holding a single branch described by ``datacube``.

        Each ``key: values`` entry becomes one node, attached as the child of
        the node created for the previous entry (the first entry hangs off the
        root), following the mapping's iteration order. ``values`` is handed to
        ``add_node`` unchanged, so it may be a scalar or a sequence.

        An empty ``datacube`` returns an empty qube instead of failing while
        unpacking the first item.
        """
        qube = cls.empty()
        node = qube.root
        for key, values in datacube.items():
            node = qube.add_node(parent=node, key=key, values=values)
        return qube

    @classmethod
    def from_dict(cls, d: dict) -> Qube:
        """Build a qube from nested dicts keyed by ``"key=v1/v2/..."`` strings.

        Every dict key is split into a node key and a ``/``-separated list of
        values; the dict it maps to describes that node's children (``{}`` for
        a leaf).

        Raises:
            ValueError: if a dict key contains no ``=``.
        """
        q = cls.empty()

        def _add_children(parent, children: dict):
            for spec, grandchildren in children.items():
                # Split on the first "=" only so that values which themselves
                # contain "=" do not break the two-way unpacking.
                key, joined_values = spec.split("=", 1)
                node = q.add_node(
                    parent=parent,
                    key=key,
                    values=joined_values.split("/"),
                )
                _add_children(node, grandchildren)

        _add_children(q.root, d)
        return q
# Demo 1: every key of the datacube becomes a node nested under the previous one.
datacube_example = {"a": ["4"], "b": "test", "c": ["1", "2", "3"]}
q = Qube.from_datacube(datacube_example)
print(q)
print(repr(q))

# Demo 2: nested dicts keyed by "key=v1/v2" strings describe a branching tree.
dict_example = {
    "a=2/3": {"b=1": {}},
    "a2=a/b": {"b2=1/2": {}},
}
q = Qube.from_dict(dict_example)
print(q)
print(repr(q))

View File

@ -1,99 +0,0 @@
# Example script for ingesting data from an fdb into a qube
# Notes
# Uses fdb --compact
# Splits by date in order to avoid out-of-memory problems with fdb --compact
# Does a bit of processing like removing "year" and "month" keys
# Might want to add datatypes and reordering of keys there too
import json
import subprocess
from datetime import datetime, timedelta
from time import time
import psutil
from qubed import Qube
from tqdm import tqdm
import requests
# Handle on the current process; used below to report memory usage.
process = psutil.Process()

# Width of the date window requested from fdb in one go.
CHUNK_SIZE = timedelta(days=60)
# Local JSON file the accumulated qube is loaded from and written back to.
FILEPATH = "tests/example_qubes/full_dt.json"
API = "https://qubed.lumi.apps.dte.destination-earth.eu/api/v1"

with open("config/api.secret", "r") as f:
    # Strip surrounding whitespace: a trailing newline left by an editor would
    # otherwise end up inside the "Authorization: Bearer ..." header value.
    secret = f.read().strip()
def ecmwf_date(d):
    """Format a date or datetime as a compact ``YYYYMMDD`` string."""
    return f"{d:%Y%m%d}"
# Ingest backwards in time from end_date until the window start passes start_date.
start_date = datetime.now() - timedelta(days=120)
# start_date = datetime(1990, 1, 1)
# end_date = datetime.now()
end_date = datetime(2026, 1, 1)

# [window start, window end] of the chunk currently being ingested.
current_span = [end_date - CHUNK_SIZE, end_date]

try:
    qube = Qube.load(FILEPATH)
except Exception as e:
    # Best effort: fall back to an empty qube when nothing can be loaded, but
    # catch Exception rather than everything so Ctrl-C / SystemExit still
    # propagate, and report why the load failed.
    print(f"Could not load {FILEPATH} ({e}), starting from an empty qube")
    qube = Qube.empty()
# Main ingestion loop: walk backwards in time one CHUNK_SIZE window at a time
# and, for each fdb config, list the window's contents and union them into the
# accumulated qube.
# NOTE(review): the nesting below was reconstructed from a view that had lost
# its indentation; in particular confirm whether the span update, the timing
# print and the JSON dump belong to the `while` body (as written here).
while current_span[0] > start_date:
    for config in ["config/config-climate-dt.yaml", "config/config-extremes-dt.yaml"]:
        t0 = time()
        start, end = map(ecmwf_date, current_span)
        print(f"Doing {config} {current_span[0].date()} - {current_span[1].date()}")
        print(f"Current memory usage: {process.memory_info().rss / 1e9:.2g}GB")
        print(f"{qube.n_nodes = }, {qube.n_leaves = },")

        # Collects everything listed for this config and window.
        subqube = Qube.empty()
        # NOTE(review): a one-element list combined with shell=True; a plain
        # string would be the usual form for a shell command — confirm.
        command = [
            f"fdb list --compact --config {config} --minimum-keys=date class=d1,date={start}/{end}"
        ]
        try:
            p = subprocess.run(
                command,
                text=True,
                shell=True,
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,
                check=True,
            )
        except Exception as e:
            # A failed listing is reported and this config is skipped.
            print(f"Failed for {current_span} {e}")
            continue

        print("Got compact list")
        for i, line in tqdm(enumerate(list(p.stdout.split("\n")))):
            # Only lines starting with "retrieve,class=" are parsed.
            if not line.startswith("retrieve,class="):
                continue

            def split(t):
                # ("key", "a/b/c") -> ("key", ["a", "b", "c"])
                return t[0], t[1].split("/")

            # Could do datatypes here
            # Drop the leading "retrieve" token, then turn each "key=v1/v2"
            # item into a dict entry.
            request = dict(split(v.split("=")) for v in line.strip().split(",")[1:])
            request.pop("year", None)
            request.pop("month", None)
            # Could do things like date = year + month + day
            q = Qube.from_datacube(request)
            subqube = subqube | q
        print("added to qube")

        qube = qube | subqube
        subqube.print(depth=2)
        print(f"{subqube.n_nodes = }, {subqube.n_leaves = },")

        # Push this chunk to the remote API as well.
        # NOTE(review): both configs post to the climate-dt endpoint and the
        # response status is not checked — confirm this is intended.
        requests.post(
            API + "/union/climate-dt/",
            headers = {"Authorization" : f"Bearer {secret}"},
            json = subqube.to_json())

    # Step the window one chunk further back in time.
    current_span = [current_span[0] - CHUNK_SIZE, current_span[0]]
    print(
        f"Did that taking {(time() - t0) / CHUNK_SIZE.days:2g} seconds per day ingested, total {(time() - t0):2g}s"
    )
    # Write the accumulated qube back to disk.
    with open(FILEPATH, "w") as f:
        json.dump(qube.to_json(), f)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,40 +1,36 @@
from qubed import Qube
q = Qube.from_tree("""
root
class=od
expver=0001
param=1
param=2
expver=0002
param=1
param=2
class=rd
expver=0001
param=1
param=2
param=3
expver=0002
param=1
param=2
""")
d = {
"class=od": {
"expver=0001": {"param=1": {}, "param=2": {}},
"expver=0002": {"param=1": {}, "param=2": {}},
},
"class=rd": {
"expver=0001": {"param=1": {}, "param=2": {}, "param=3": {}},
"expver=0002": {"param=1": {}, "param=2": {}},
},
}
q = Qube.from_dict(d)
def test_eq():
    """A qube rebuilt from the same dict ``d`` compares equal to the fixture ``q``."""
    rebuilt = Qube.from_dict(d)
    assert q == rebuilt
def test_getitem():
    """Indexing with ``(key, value)`` yields the sub-qube below that node."""
    # Expected sub-qubes written in tree-string form.
    od_from_tree = Qube.from_tree("""
root
expver=0001
param=1
param=2
expver=0002
param=1
param=2
""")
    assert q["class", "od"] == od_from_tree

    od_0001_from_tree = Qube.from_tree("""
root
param=1
param=2""")
    assert q["class", "od"]["expver", "0001"] == od_0001_from_tree

    # The same expectations written in nested-dict form.
    od_from_dict = Qube.from_dict(
        {
            "expver=0001": {"param=1": {}, "param=2": {}},
            "expver=0002": {"param=1": {}, "param=2": {}},
        }
    )
    assert q["class", "od"] == od_from_dict

    od_0001_from_dict = Qube.from_dict(
        {
            "param=1": {},
            "param=2": {},
        }
    )
    assert q["class", "od"]["expver", "0001"] == od_0001_from_dict
def test_n_leaves():

View File

@ -15,62 +15,19 @@ def test_smoke():
}
)
ct = Qube.from_tree("""
root
class=od, expver=0001/0002, param=1/2
class=rd
expver=0001, param=1/2/3
expver=0002, param=1/2
""")
# root
# ├── class=od, expver=0001/0002, param=1/2
# └── class=rd
# ├── expver=0001, param=1/2/3
# └── expver=0002, param=1/2
ct = Qube.from_dict(
{
"class=od": {"expver=0001/0002": {"param=1/2": {}}},
"class=rd": {
"expver=0001": {"param=1/2/3": {}},
"expver=0002": {"param=1/2": {}},
},
}
)
assert q.compress() == ct
def test_2():
    """Three per-date branches with identical subtrees compress into one datacube."""
    per_date_branches = {
        "class=d1": {
            "generation=1": {
                "date=20240728": {"time=0600": {"param=8/78/79": {}}},
                "date=20240828": {"time=0600": {"param=8/78/79": {}}},
                "date=20240928": {"time=0600": {"param=8/78/79": {}}},
            }
        }
    }
    uncompressed = Qube.from_dict(per_date_branches)

    merged_datacube = {
        "class": "d1",
        "generation": "1",
        "date": ["20240728", "20240828", "20240928"],
        "time": "0600",
        "param": ["8", "78", "79"],
    }
    expected = Qube.from_datacube(merged_datacube)

    assert uncompressed.compress() == expected
def test_removal_compression():
    """Removing the ``month`` key leaves a qube equal to the single merged datacube."""
    with_month = {
        "class=d1": {
            "generation=1": {
                "month=07": {"date=20240728": {"time=0600": {"param=8/78/79": {}}}},
                "month=08": {"date=20240828": {"time=0600": {"param=8/78/79": {}}}},
                "month=09": {"date=20240928": {"time=0600": {"param=8/78/79": {}}}},
            }
        }
    }
    source = Qube.from_dict(with_month)

    without_month = {
        "class": "d1",
        "generation": "1",
        "date": ["20240728", "20240828", "20240928"],
        "time": "0600",
        "param": ["8", "78", "79"],
    }
    expected = Qube.from_datacube(without_month)

    assert source.remove_by_key(["month"]) == expected

View File

@ -2,7 +2,7 @@ from qubed import Qube
def test_json_round_trip():
from_dict = Qube.from_dict(
u = Qube.from_dict(
{
"class=d1": {
"dataset=climate-dt/weather-dt": {
@ -14,54 +14,5 @@ def test_json_round_trip():
}
}
)
from_tree = Qube.from_tree("""
root, class=d1
dataset=another-value, generation=1/2/3
dataset=climate-dt/weather-dt, generation=1/2/3/4
""")
from_json = Qube.from_json(
{
"key": "root",
"values": ["root"],
"metadata": {},
"children": [
{
"key": "class",
"values": ["d1"],
"metadata": {},
"children": [
{
"key": "dataset",
"values": ["another-value"],
"metadata": {},
"children": [
{
"key": "generation",
"values": ["1", "2", "3"],
"metadata": {},
"children": [],
}
],
},
{
"key": "dataset",
"values": ["climate-dt", "weather-dt"],
"metadata": {},
"children": [
{
"key": "generation",
"values": ["1", "2", "3", "4"],
"metadata": {},
"children": [],
}
],
},
],
}
],
}
)
assert from_tree == from_json
assert from_tree == from_dict
json = u.to_json()
assert Qube.from_json(json) == u

View File

@ -20,6 +20,14 @@ root
expver=0002, param=1/2
""".strip()
as_html = """
<details open data-path="root"><summary class="qubed-node">root</summary><span class="qubed-node leaf" data-path="class=od,expver=0001/0002,param=1/2"> class=od, expver=0001/0002, param=1/2</span><details open data-path="class=rd"><summary class="qubed-node"> class=rd</summary><span class="qubed-node leaf" data-path="expver=0001,param=1/2/3"> expver=0001, param=1/2/3</span><span class="qubed-node leaf" data-path="expver=0002,param=1/2"> expver=0002, param=1/2</span></details></details>
""".strip()
def test_string():
    """``str(q)``, ignoring surrounding whitespace, matches the expected tree text."""
    rendered = str(q).strip()
    assert rendered == as_string
def test_html():
    """The expected HTML fragment appears somewhere in ``q._repr_html_()``."""
    html = q._repr_html_()
    assert as_html in html

View File

@ -16,29 +16,3 @@ def test_iter_leaves_simple():
]
assert set(make_hashable(q.leaves())) == set(make_hashable(entries))
def test_datacubes():
    """``datacubes()`` yields one key -> values mapping per datacube, in order."""
    tree = Qube.from_tree("""
root, class=d1
date=19920101/19930101/19940101, params=1/2/3
date=19950101
level=1/2/3, params=1/2/3/4
params=1/2/3/4
""")
    expected = [
        {
            "class": ["d1"],
            "date": ["19920101", "19930101", "19940101"],
            "params": ["1", "2", "3"],
        },
        {
            "class": ["d1"],
            "date": ["19950101"],
            "level": ["1", "2", "3"],
            "params": ["1", "2", "3", "4"],
        },
        {"class": ["d1"], "date": ["19950101"], "params": ["1", "2", "3", "4"]},
    ]

    assert len(list(tree.datacubes())) == 3
    assert list(tree.datacubes()) == expected

View File

@ -1,98 +0,0 @@
# from frozendict import frozendict
# from qubed import Qube
# def make_set(entries):
# return set((frozendict(a), frozendict(b)) for a, b in entries)
# def construction():
# q = Qube.from_nodes(
# {
# "class": dict(values=["od", "rd"]),
# "expver": dict(values=[1, 2]),
# "stream": dict(
# values=["a", "b", "c"], metadata=dict(number=list(range(12)))
# ),
# }
# )
# assert make_set(q.leaves_with_metadata()) == make_set([
# ({'class': 'od', 'expver': 1, 'stream': 'a'}, {'number': 0}),
# ({'class': 'od', 'expver': 1, 'stream': 'b'}, {'number': 1}),
# ({'class': 'od', 'expver': 1, 'stream': 'c'}, {'number': 2}),
# ({'class': 'od', 'expver': 2, 'stream': 'a'}, {'number': 3}),
# ({'class': 'od', 'expver': 2, 'stream': 'b'}, {'number': 4}),
# ({'class': 'od', 'expver': 2, 'stream': 'c'}, {'number': 5}),
# ({'class': 'rd', 'expver': 1, 'stream': 'a'}, {'number': 6}),
# ({'class': 'rd', 'expver': 1, 'stream': 'b'}, {'number': 7}),
# ({'class': 'rd', 'expver': 1, 'stream': 'c'}, {'number': 8}),
# ({'class': 'rd', 'expver': 2, 'stream': 'a'}, {'number': 9}),
# ({'class': 'rd', 'expver': 2, 'stream': 'b'}, {'number': 10}),
# ({'class': 'rd', 'expver': 2, 'stream': 'c'}, {'number': 11})])
# def test_simple_union():
# q = Qube.from_nodes(
# {
# "class": dict(values=["od", "rd"]),
# "expver": dict(values=[1, 2]),
# "stream": dict(
# values=["a", "b", "c"], metadata=dict(number=list(range(12)))
# ),
# }
# )
# r = Qube.from_nodes(
# {
# "class": dict(values=["xd"]),
# "expver": dict(values=[1, 2]),
# "stream": dict(
# values=["a", "b", "c"], metadata=dict(number=list(range(12, 18)))
# ),
# }
# )
# expected_union = Qube.from_nodes(
# {
# "class": dict(values=["od", "rd", "xd"]),
# "expver": dict(values=[1, 2]),
# "stream": dict(
# values=["a", "b", "c"], metadata=dict(number=list(range(18)))
# ),
# }
# )
# union = q | r
# assert union == expected_union
# assert make_set(expected_union.leaves_with_metadata()) == make_set(
# union.leaves_with_metadata()
# )
# def test_construction_from_fdb():
# import json
# paths = {}
# current_path = None
# i = 0
# qube = Qube.empty()
# with open("tests/data/climate_dt_paths.json") as f:
# for l in f.readlines():
# i += 1
# j = json.loads(l)
# if "type" in j and j["type"] == "path":
# paths[j["i"]] = j["path"]
# else:
# request = j.pop("keys")
# metadata = j
# # print(request, metadata)
# q = Qube.from_nodes({
# key : dict(values = [value])
# for key, value in request.items()
# }).add_metadata(**metadata)
# qube = qube | q
# if i > 100: break

View File

@ -1,12 +0,0 @@
from qubed import Qube
def test_protobuf_simple():
q = Qube.from_tree("""
root, class=d1
dataset=another-value, generation=1/2/3
dataset=climate-dt/weather-dt, generation=1/2/3/4
""")
wire = q.to_protobuf()
round_trip = Qube.from_protobuf(wire)
assert round_trip == q

View File

@ -1,21 +1,5 @@
from __future__ import annotations
from qubed.rust import hello
from qubed.rust import Qube as Qube
# def test_from_json():
# q = pyQube.from_tree("""
# root, class=d1
# ├── dataset=another-value, generation=1/2/3
# └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """)
# json_str = json.dumps(q.to_json())
# rust_qube = Qube.from_json(json_str)
# print(repr(rust_qube))
# expected = """root, class=d1
# ├── dataset=another-value, generation=1/2/3
# └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """
# assert repr(rust_qube) == expected
def test_hello():
    """``hello`` from ``qubed.rust`` greets the given name."""
    greeting = hello("World")
    assert greeting == "Hello, World!"

View File

@ -1,24 +1,29 @@
from qubed import Qube
q = Qube.from_tree("""
root
class=od, expver=0001/0002, param=1/2
class=rd, param=1/2/3
""")
q = Qube.from_dict(
{
"class=od": {
"expver=0001": {"param=1": {}, "param=2": {}},
"expver=0002": {"param=1": {}, "param=2": {}},
},
"class=rd": {"param=1": {}, "param=2": {}, "param=3": {}},
}
)
def test_consumption():
assert q.select({"expver": "0001"}, consume=True) == Qube.from_tree(
"root, class=od, expver=0001, param=1/2"
assert q.select({"expver": "0001"}, consume=True) == Qube.from_dict(
{"class=od": {"expver=0001": {"param=1": {}, "param=2": {}}}}
)
def test_consumption_off():
    """With ``consume=False`` branches lacking the selected key are kept."""
    # Tree-string form of the expectation; the name is rebound just below.
    expected = Qube.from_tree("""
root
class=od, expver=0001, param=1/2
class=rd, param=1/2/3
""")
    # Nested-dict form of the expectation; this is the value that is compared.
    expected = Qube.from_dict(
        {
            "class=od": {"expver=0001": {"param=1": {}, "param=2": {}}},
            "class=rd": {"param=1": {}, "param=2": {}, "param=3": {}},
        }
    )
    selected = q.select({"expver": "0001"}, consume=False)
    assert selected == expected

View File

@ -1,127 +1,6 @@
from qubed import Qube
def set_operation_testcase(name, testcase):
    """Check union, intersection and difference for one named test case.

    Args:
        name: label of the case, used only in the failure messages.
        testcase: mapping with the keys ``q1``, ``q2``, ``union``,
            ``intersection`` and ``difference``; each value is a tree string
            parsed with ``Qube.from_tree``.

    Raises:
        AssertionError: if any of the three operations on ``q1`` and ``q2``
            differs from the expected qube.
    """
    q1 = Qube.from_tree(testcase["q1"])
    q2 = Qube.from_tree(testcase["q2"])

    assert q1 | q2 == Qube.from_tree(testcase["union"]), (
        f"Case: {name} Op: Union\n {q1 = }\n {q2 = }\n {q1 | q2 = }\n expected = {testcase['union']}\n"
    )
    # The message reports q1 & q2, the operation actually under test here
    # (it previously printed the difference q1 - q2 by mistake).
    assert q1 & q2 == Qube.from_tree(testcase["intersection"]), (
        f"Case: {name} Op: Intersection\n {q1 = }\n {q2 = }\n {q1 & q2 = }\n expected = {testcase['intersection']}\n"
    )
    assert q1 - q2 == Qube.from_tree(testcase["difference"]), (
        f"Case: {name} Op: Difference\n {q1 = }\n {q2 = }\n {q1 - q2 = }\n expected = {testcase['difference']}\n"
    )
# These are a bunch of testcases where q1 and q2 are specified and then their union/intersection/difference are checked
# Generate them with code like:
# q1 = Qube.from_tree("root, frequency=*, levtype=*, param=*, levelist=*, domain=a/b/c/d")
# q2 = Qube.from_tree("root, frequency=*, levtype=*, param=*, domain=a/b/c/d")
# test = {
# "q1": str(q1),
# "q2": str(q2),
# "union": str(q1 | q2),
# "intersection": str(q1 & q2),
# "difference": str(q1 - q2),
# }
# BUT MANUALLY CHECK THE OUTPUT BEFORE ADDING IT AS A TEST CASE!
testcases = {
"Simplest case, only leaves differ": {
"q1": "root, a=1, b=1, c=1",
"q2": "root, a=1, b=1, c=2",
"union": "root, a=1, b=1, c=1/2",
"intersection": "root",
"difference": "root, a=1, b=1, c=1",
},
"Some overlap but also each tree has unique items": {
"q1": "root, a=1, b=1, c=1/2/3",
"q2": "root, a=1, b=1, c=2/3/4",
"union": "root, a=1, b=1, c=1/2/3/4",
"intersection": "root, a=1, b=1, c=2/3",
"difference": "root, a=1, b=1, c=1",
},
"Overlap at two levels": {
"q1": "root, a=1, b=1/2, c=1/2/3",
"q2": "root, a=1, b=2/3, c=2/3/4",
"union": """
root, a=1
b=1, c=1/2/3
b=2, c=1/2/3/4
b=3, c=2/3/4
""",
"intersection": "root, a=1, b=2, c=2/3",
"difference": """
root, a=1
b=1, c=1/2/3
b=2, c=1""",
},
"Simple difference": {
"q1": "root, a=1, b=1, c=1/2/3",
"q2": "root, a=1, b=1, c=2",
"union": "root, a=1, b=1, c=1/2/3",
"intersection": "root, a=1, b=1, c=2",
"difference": "root, a=1, b=1, c=1/3",
},
"Check that we can merge even if the divergence point is higher": {
"q1": "root, a=1, b=1, c=1",
"q2": "root, a=2, b=1, c=1",
"union": "root, a=1/2, b=1, c=1",
"intersection": "root",
"difference": "root, a=1, b=1, c=1",
},
"Two equal qubes": {
"q1": "root, a=1, b=1, c=1",
"q2": "root, a=1, b=1, c=1",
"union": "root, a=1, b=1, c=1",
"intersection": "root, a=1, b=1, c=1",
"difference": "root",
},
"Two qubes that don't compress on their own but the union does": {
"q1": """
root
a=1/3, b=1
a=2, b=1/2
""",
"q2": "root, a=1/3, b=2",
"union": "root, a=1/2/3, b=1/2",
"intersection": "root",
"difference": """
root
a=1/3, b=1
a=2, b=1/2
""",
},
"With wildcards": {
"q1": "root, frequency=*, levtype=*, param=*, levelist=*, domain=a/b/c/d",
"q2": "root, frequency=*, levtype=*, param=*, domain=a/b/c/d",
"union": """
root, frequency=*, levtype=*, param=*
domain=a/b/c/d
levelist=*, domain=a/b/c/d
""",
"intersection": "root",
"difference": "root, frequency=*, levtype=*, param=*, levelist=*, domain=a/b/c/d",
},
"Merging wildcard groups": {
"q1": "root, levtype=pl, param=q, levelist=100/1000, quantile=*",
"q2": "root, levtype=pl, param=t, levelist=100/1000, quantile=*",
"union": "root, levtype=pl, param=q/t, levelist=100/1000, quantile=*",
"intersection": "root",
"difference": "root, levtype=pl, param=q, levelist=100/1000, quantile=*",
},
}
def test_cases():
    """Run every entry of ``testcases`` through ``set_operation_testcase``."""
    for label in testcases:
        set_operation_testcase(label, testcases[label])
def test_leaf_conservation():
q = Qube.from_dict(
{

View File

@ -1,12 +1,17 @@
from qubed import Qube
q = Qube.from_tree("""
root
class=od, expver=0001/0002, param=1/2
class=rd
expver=0001, param=1/2/3
expver=0002, param=1/2
""")
q = Qube.from_dict(
{
"class=od": {
"expver=0001": {"param=1": {}, "param=2": {}},
"expver=0002": {"param=1": {}, "param=2": {}},
},
"class=rd": {
"expver=0001": {"param=1": {}, "param=2": {}, "param=3": {}},
"expver=0002": {"param=1": {}, "param=2": {}},
},
}
)
wild_datacube = {
"class": "*",
@ -29,16 +34,3 @@ def test_intersection():
},
}
)
def test_wildcard_union():
    """Union of two wildcard qubes that differ only by the ``levelist`` key."""
    with_levelist = Qube.from_tree(
        "root, frequency=*, levtype=*, param=*, levelist=*, domain=a/b/c/d"
    )
    without_levelist = Qube.from_tree(
        "root, frequency=*, levtype=*, param=*, domain=a/b/c/d"
    )
    expected = Qube.from_tree("""
root, frequency=*, levtype=*, param=*
domain=a/b/c/d
levelist=*, domain=a/b/c/d
""")
    merged = with_levelist | without_levelist
    assert merged == expected

1
web_query_builder/.env Normal file
View File

@ -0,0 +1 @@
API_HOST=localhost:8124

33
web_query_builder/app.py Normal file
View File

@ -0,0 +1,33 @@
import os
from flask import (
Flask,
render_template,
request,
)
from flask_cors import CORS
from werkzeug.middleware.proxy_fix import ProxyFix
app = Flask(__name__)

# Allow cross-origin requests from any origin on the /api/* routes.
CORS(app, resources={r"/api/*": {"origins": "*"}})

# In k8s the Flask server sits behind a TLS-terminating proxy:
#   Client <-- https --> Proxy <-- http --> Flask server
# so Flask itself only ever sees plain http. For the OAuth flow Flask has to
# hand out a callback URL carrying the scheme the client really used (https).
# ProxyFix makes Flask trust the forwarding headers set by the proxy to work
# out what the original request looked like.
# See https://flask.palletsprojects.com/en/3.0.x/deploying/proxy_fix/
app.wsgi_app = ProxyFix(
    app.wsgi_app,
    x_for=1,
    x_proto=1,
    x_host=1,
    x_prefix=1,
)

# Passed to the index template as `config`; empty here.
config = {}
@app.route("/")
def index():
    """Render ``index.html`` for the site root.

    The template receives the current request, the module-level ``config`` and
    the API URL, read from the ``API_URL`` environment variable and falling
    back to ``/api/stac`` when it is unset.
    """
    stac_api_url = os.environ.get("API_URL", "/api/stac")
    return render_template(
        "index.html",
        request=request,
        config=config,
        api_url=stac_api_url,
    )

View File

@ -0,0 +1,8 @@
flask==3
pyyaml
flask_dance
python-dotenv
flask-login
flask-cors
cachetools
uvicorn

2
web_query_builder/run.sh Executable file
View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Start the Flask app with --debug on port 5006, with API_URL pointing at a
# STAC API on 127.0.0.1:8124.
# The shebang is needed because the file is marked executable; without it the
# interpreter used depends on whatever shell happens to launch the script.
export API_URL="http://127.0.0.1:8124/api/stac"
flask run --debug --port=5006

View File

@ -138,23 +138,39 @@ async function createCatalogItem(link, itemsContainer) {
// Update the item div with real content
itemDiv.classList.remove("loading");
const variables = link["variables"];
const key = Object.keys(variables)[0];
const variable = variables[key];
const dimension = link["generalized_datacube:dimension"];
// add data-key attribute to the itemDiv
itemDiv.dataset.key = link.title;
itemDiv.dataset.keyType = variable.type;
itemDiv.dataset.keyType = dimension.type;
itemDiv.innerHTML = `
<h3 class="item-title">${link.title || "No title available"}</h3>
<p class="item-type">Key Type: ${itemDiv.dataset.keyType || "Unknown"}</p>
<!-- <p class="item-type">Paths: ${dimension.paths}</p> -->
<p class="item-type">Optional: ${dimension.optional ? "Yes" : "No"}</p>
<p class="item-description">${
variable.description ? variable.description.slice(0, 100) : ""
}</p>
dimension.description
? dimension.description.slice(0, 100)
: "No description available"
}...</p>
`;
if (variable.enum && variable.enum.length > 0) {
// if (dimension.type === "date" || dimension.type === "time") {
// // Render a date picker for the "date" key
// const picker = `<input type="${link.title}" name="${link.title}">`;
// //convert picker to HTML node
// const pickerNode = document
// .createRange()
// .createContextualFragment(picker);
// itemDiv.appendChild(pickerNode);
// }
// Otherwise create a scrollable list with checkboxes for values if available
if (
// dimension.type === "enum" &&
dimension.values &&
dimension.values.length > 0
) {
const listContainer = renderCheckboxList(link);
itemDiv.appendChild(listContainer);
} else {
@ -169,15 +185,14 @@ async function createCatalogItem(link, itemsContainer) {
}
function renderCheckboxList(link) {
const variables = link["variables"];
const key = Object.keys(variables)[0];
const variable = variables[key];
const value_descriptions = variable.value_descriptions || [];
const dimension = link["generalized_datacube:dimension"];
const value_descriptions = dimension.value_descriptions || [];
const listContainerHTML = `
<div class="item-list-container">
<label class="list-label">Select one or more values:</label>
<div class="scrollable-list">
${variable.enum
${dimension.values
.map((value, index) => {
const labelText = value_descriptions[index]
? `${value} - ${value_descriptions[index]}`
@ -186,7 +201,7 @@ function renderCheckboxList(link) {
<div class="checkbox-container">
<label class="checkbox-label">
<input type="checkbox" class="item-checkbox" value="${value}" ${
variable.enum.length === 1 ? "checked" : ""
dimension.values.length === 1 ? "checked" : ""
}>
${labelText}
</label>
@ -253,10 +268,8 @@ function renderRawSTACResponse(catalog) {
itemDetails.textContent = JSON.stringify(just_stac, null, 2);
const debug_container = document.getElementById("debug");
// create new object without debug key
debug_container.textContent = JSON.stringify(catalog.debug, null, 2);
const qube_container = document.getElementById("qube");
qube_container.innerHTML = catalog.debug.qube;
}
// Fetch STAC catalog and display items
@ -280,7 +293,6 @@ async function fetchCatalog(request, stacUrl) {
// Highlight the request and raw STAC
hljs.highlightElement(document.getElementById("raw-stac"));
hljs.highlightElement(document.getElementById("debug"));
hljs.highlightElement(document.getElementById("example-python"));
} catch (error) {
console.error("Error fetching STAC catalog:", error);
}

View File

@ -2,9 +2,6 @@ html,
body {
min-height: 100vh;
height: 100%;
--accent-color: #003399;
--background-grey: #f4f4f4;
}
body {
@ -26,7 +23,7 @@ body {
width: 30%;
padding: 10px;
overflow-y: scroll;
background-color: var(--background-grey);
background-color: #f4f4f4;
border-right: 1px solid #ddd;
}
@ -48,9 +45,7 @@ body {
}
.sidebar-header button {
width: 7em;
height: 2em;
padding: 0;
width: 10em;
}
canvas {
@ -68,7 +63,6 @@ canvas {
margin-bottom: 10px;
border-radius: 5px;
transition: background-color 0.2s ease;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
}
.item-title {
@ -97,8 +91,10 @@ canvas {
}
.item.selected {
background-color: var(--background-grey);
border-color: var(--accent-color);
background-color: #d4e9ff;
/* Lighter blue for selection */
border-color: #003399;
/* Keep the original ECMWF blue for the border */
}
summary h2 {
@ -121,7 +117,7 @@ button {
/* Padding around button text */
margin: 0 5px;
/* Margin between buttons */
background-color: var(--accent-color);
background-color: #003399;
/* ECMWF blue */
color: white;
/* White text color */
@ -142,6 +138,7 @@ button:hover {
.item-list-container {
margin-top: 20px;
margin-bottom: 20px;
}
.scrollable-list {
@ -151,6 +148,7 @@ button:hover {
border: 1px solid #ccc;
border-radius: 4px;
background-color: #fff;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
}
.checkbox-container {
@ -170,14 +168,14 @@ button:hover {
}
.checkbox-container:hover .checkbox-label {
color: var(--accent-color);
color: #003399;
}
.list-label {
font-weight: bold;
margin-bottom: 0.5em;
display: block;
color: var(--accent-color);
color: #003399;
}
span.key,
@ -211,7 +209,3 @@ span.value:hover {
width: 100%;
}
}
details h2 {
font-size: medium;
}

View File

@ -5,7 +5,6 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ECMWF DestinE STAC Viewer</title>
<link rel="stylesheet" href="/static/styles.css" />
<link rel="stylesheet" href="/static/qube_styles.css" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/json.min.js"></script>
@ -39,25 +38,8 @@
}
</code></pre>
<!-- Container to show the current tree -->
<h2>Currently Selected Tree</h2></summary>
<p>This shows the data <a href="https://qubed.readthedocs.io/en/latest/quickstart.html">qube</a> that matches with the current query. The leaves are the next set of available selections you can make. </p>
<pre id = "qube"></pre>
<details>
<summary><h2>Example Qubed Code</h2></summary>
See the <a href="https://qubed.readthedocs.io/en/latest/">Qubed documentation</a> for more details.
<pre><code id="example-python" class="language-python">
# pip install qubed requests
import requests
from qubed import Qube
qube = Qube.from_json(requests.get("{{ api_url }}select/climate-dt/?{{request.url.query}}").json())
qube.print()
</code></pre>
</details>
<!-- Container for the raw STAC response -->
<details>
<details open>
<summary><h2>Raw STAC Response</h2></summary>
<p>See the <a href="https://github.com/ecmwf-projects/catalogs/blob/main/structured_stac.md">STAC Extension Proposal</a> for more details on the format.</p>
<pre class="json-pre"><code id="raw-stac" class="language-json"></code></pre>
@ -72,7 +54,7 @@ qube.print()
</div>
<script>
window.API_URL = "{{ api_url }}stac/climate-dt/";
window.API_URL = "{{ api_url }}";
</script>
<script src="/static/app.js"></script>
</body>