update docs

2025-03-27 18:30:12 +00:00 · 2025-03-27 18:30:12 +00:00 · 2884f9fff8
commit 2884f9fff8
parent df5360f29a
6 changed files with 27 additions and 503 deletions
--- a/docs/api.md
+++ b/docs/api.md
@ -1,13 +0,0 @@
-# API
-
-## Set Operations
-
-```{code-cell} python3
-from qubed import Qube
-
-A = Qube.from_dict({
-    "a=1": {"b": {1, 2, 3}, "c": {1}},
-    "a=2": {"b": {1, 2, 3}, "c": {1}},
-})
-A
-```
--- a/docs/cmd.md
+++ b/docs/cmd.md
@ -2,21 +2,18 @@

 ```bash
 fdb list class=rd,expver=0001,... | qubed --from=fdblist --to=text
+
+fdb list --minimum-keys=class class=d1,dataset=climate-dt --config prod_remoteFDB.yaml  | qubed convert --from=fdb --to=text
+
 ```

 `--from` options include:
-* `fdblist`
-* `json`
-* `protobuf`
-* `marslist`
-* `constraints`
+* `fdb`

 `--to` options include:
 * `text`
 * `html`
 * `json`
-* `datacubes`
-* `constraints`

 use `--input` and `--output` to specify input and output files respectively.

@ -27,3 +24,15 @@ gzip -dc tests/data/fdb_list_compact.gz| qubed convert --from=fdb --to=text --ou
 gzip -dc tests/data/fdb_list_porcelain.gz| qubed convert --from=fdb --to=json --output=qube.json
 gzip -dc tests/data/fdb_list_compact.gz | qubed convert --from=fdb --to=html --output=qube.html
 ```
+
+## Todo
+
+--from for
+* `protobuf`
+* `marslist`
+* `constraints`
+
+--to for
+* `json`
+* `datacubes`
+* `constraints`
--- a/docs/development.md
+++ b/docs/development.md
@ -1,8 +0,0 @@
-# Development
-
-To build the develop branch from source install a rust toolchain and pip install maturin then run:
-```
-git clone -b develop git@github.com:ecmwf/qubed.git
-cd qubed
-maturin develop
-```
--- a/docs/index.md
+++ b/docs/index.md
@ -11,12 +11,11 @@ jupytext:

 ```{toctree}
 :maxdepth: 1
-background.md
 quickstart.md
-api.md
-development.md
+background.md
 algorithms.md
 fiab.md
+cmd.md
 ```

 Qubed provides a datastructure called a Qube which represents sets of data identified by multiple key value pairs as a tree of datacubes. To understand what that means go to [Background](background.md), to just start using the library skip straight to the [Quickstart](quickstart.md).
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@ -18,6 +18,15 @@ Or to build and install the latest version from github (requires cargo):
 pip install qubed@git+https://github.com/ecmwf/qubed.git@main
 ```

+## Development
+
+To build the develop branch from source install a rust toolchain and pip install maturin then run:
+```
+git clone -b develop git@github.com:ecmwf/qubed.git
+cd qubed
+maturin develop
+```
+
 ## Usage
 Make an uncompressed qube:

--- a/notebooks/DataCubeTree.py
+++ b/notebooks/DataCubeTree.py
@ -1,472 +0,0 @@
-import dataclasses
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from datetime import date, datetime, timedelta
-from typing import Any, Callable, Iterable, Literal
-
-
-@dataclass(frozen=True)
-class HTML:
-    html: str
-
-    def _repr_html_(self):
-        return self.html
-
-
-@dataclass(frozen=True)
-class Values(ABC):
-    @abstractmethod
-    def summary(self) -> str:
-        pass
-
-    @abstractmethod
-    def __len__(self) -> int:
-        pass
-
-    @abstractmethod
-    def __contains__(self, value: Any) -> bool:
-        pass
-
-    @abstractmethod
-    def from_strings(self, values: list[str]) -> list["Values"]:
-        pass
-
-
-@dataclass(frozen=True)
-class Enum(Values):
-    """
-    The simplest kind of key value is just a list of strings.
-    summary -> string1/string2/string....
-    """
-
-    values: list[Any]
-
-    def __len__(self) -> int:
-        return len(self.values)
-
-    def summary(self) -> str:
-        return "/".join(sorted(self.values))
-
-    def __contains__(self, value: Any) -> bool:
-        return value in self.values
-
-    def from_strings(self, values: list[str]) -> list["Values"]:
-        return [Enum(values)]
-
-
-@dataclass(frozen=True)
-class Range(Values, ABC):
-    dtype: str = dataclasses.field(kw_only=True)
-
-
-@dataclass(frozen=True)
-class DateRange(Range):
-    start: date
-    end: date
-    step: timedelta
-    dtype: Literal["date"] = dataclasses.field(kw_only=True, default="date")
-
-    @classmethod
-    def from_strings(self, values: list[str]) -> list["DateRange"]:
-        dates = sorted([datetime.strptime(v, "%Y%m%d") for v in values])
-        if len(dates) < 2:
-            return [DateRange(start=dates[0], end=dates[0], step=timedelta(days=0))]
-
-        ranges = []
-        current_range, dates = (
-            [
-                dates[0],
-            ],
-            dates[1:],
-        )
-        while len(dates) > 1:
-            if dates[0] - current_range[-1] == timedelta(days=1):
-                current_range.append(dates.pop(0))
-
-            elif len(current_range) == 1:
-                ranges.append(
-                    DateRange(
-                        start=current_range[0],
-                        end=current_range[0],
-                        step=timedelta(days=0),
-                    )
-                )
-                current_range = [
-                    dates.pop(0),
-                ]
-
-            else:
-                ranges.append(
-                    DateRange(
-                        start=current_range[0],
-                        end=current_range[-1],
-                        step=timedelta(days=1),
-                    )
-                )
-                current_range = [
-                    dates.pop(0),
-                ]
-        return ranges
-
-    def __contains__(self, value: Any) -> bool:
-        v = datetime.strptime(value, "%Y%m%d").date()
-        return self.start <= v <= self.end and (v - self.start) % self.step == 0
-
-    def __len__(self) -> int:
-        return (self.end - self.start) // self.step
-
-    def summary(self) -> str:
-        def fmt(d):
-            return d.strftime("%Y%m%d")
-
-        if self.step == timedelta(days=0):
-            return f"{fmt(self.start)}"
-        if self.step == timedelta(days=1):
-            return f"{fmt(self.start)}/to/{fmt(self.end)}"
-
-        return (
-            f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step // timedelta(days=1)}"
-        )
-
-
-@dataclass(frozen=True)
-class TimeRange(Range):
-    start: int
-    end: int
-    step: int
-    dtype: Literal["time"] = dataclasses.field(kw_only=True, default="time")
-
-    @classmethod
-    def from_strings(self, values: list[str]) -> list["TimeRange"]:
-        if len(values) == 0:
-            return []
-
-        times = sorted([int(v) for v in values])
-        if len(times) < 2:
-            return [TimeRange(start=times[0], end=times[0], step=100)]
-
-        ranges = []
-        current_range, times = (
-            [
-                times[0],
-            ],
-            times[1:],
-        )
-        while len(times) > 1:
-            if times[0] - current_range[-1] == 1:
-                current_range.append(times.pop(0))
-
-            elif len(current_range) == 1:
-                ranges.append(
-                    TimeRange(start=current_range[0], end=current_range[0], step=0)
-                )
-                current_range = [
-                    times.pop(0),
-                ]
-
-            else:
-                ranges.append(
-                    TimeRange(start=current_range[0], end=current_range[-1], step=1)
-                )
-                current_range = [
-                    times.pop(0),
-                ]
-        return ranges
-
-    def __len__(self) -> int:
-        return (self.end - self.start) // self.step
-
-    def summary(self) -> str:
-        def fmt(d):
-            return f"{d:04d}"
-
-        if self.step == 0:
-            return f"{fmt(self.start)}"
-        return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"
-
-    def __contains__(self, value: Any) -> bool:
-        v = int(value)
-        return self.start <= v <= self.end and (v - self.start) % self.step == 0
-
-
-@dataclass(frozen=True)
-class IntRange(Range):
-    dtype: Literal["int"]
-    start: int
-    end: int
-    step: int
-    dtype: Literal["int"] = dataclasses.field(kw_only=True, default="int")
-
-    def __len__(self) -> int:
-        return (self.end - self.start) // self.step
-
-    def summary(self) -> str:
-        def fmt(d):
-            return d.strftime("%Y%m%d")
-
-        return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"
-
-    def __contains__(self, value: Any) -> bool:
-        v = int(value)
-        return self.start <= v <= self.end and (v - self.start) % self.step == 0
-
-
-def values_from_json(obj) -> Values:
-    if isinstance(obj, list):
-        return Enum(obj)
-
-    match obj["dtype"]:
-        case "date":
-            return DateRange(**obj)
-        case "time":
-            return TimeRange(**obj)
-        case "int":
-            return IntRange(**obj)
-        case _:
-            raise ValueError(f"Unknown dtype {obj['dtype']}")
-
-
-@dataclass(frozen=True)
-class Node:
-    key: str
-    values: Values  # Must support len()
-    metadata: dict[str, str]  # Applies to all children
-    payload: list[Any]  # List of size product(len(n.values) for n in  ancestors(self))
-    children: list["Node"]
-
-
-def summarize_node(node: Node) -> tuple[str, Node]:
-    """
-    Extracts a summarized representation of the node while collapsing single-child paths.
-    Returns the summary string and the last node in the chain that has multiple children.
-    """
-    summary = []
-
-    while True:
-        values_summary = node.values.summary()
-        if len(values_summary) > 50:
-            values_summary = values_summary[:50] + "..."
-        summary.append(f"{node.key}={values_summary}")
-
-        # Move down if there's exactly one child, otherwise stop
-        if len(node.children) != 1:
-            break
-        node = node.children[0]
-
-    return ", ".join(summary), node
-
-
-def node_tree_to_string(node: Node, prefix: str = "", depth=None) -> Iterable[str]:
-    summary, node = summarize_node(node)
-
-    if depth is not None and depth <= 0:
-        yield summary + " - ...\n"
-        return
-    # Special case for nodes with only a single child, this makes the printed representation more compact
-    elif len(node.children) == 1:
-        yield summary + ", "
-        yield from node_tree_to_string(node.children[0], prefix, depth=depth)
-        return
-    else:
-        yield summary + "\n"
-
-    for index, child in enumerate(node.children):
-        connector = "└── " if index == len(node.children) - 1 else "├── "
-        yield prefix + connector
-        extension = "    " if index == len(node.children) - 1 else "│   "
-        yield from node_tree_to_string(
-            child, prefix + extension, depth=depth - 1 if depth is not None else None
-        )
-
-
-def node_tree_to_html(
-    node: Node, prefix: str = "", depth=1, connector=""
-) -> Iterable[str]:
-    summary, node = summarize_node(node)
-
-    if len(node.children) == 0:
-        yield f'<span class="leaf">{connector}{summary}</span>'
-        return
-    else:
-        open = "open" if depth > 0 else ""
-        yield f"<details {open}><summary>{connector}{summary}</summary>"
-
-    for index, child in enumerate(node.children):
-        connector = "└── " if index == len(node.children) - 1 else "├── "
-        extension = "    " if index == len(node.children) - 1 else "│   "
-        yield from node_tree_to_html(
-            child, prefix + extension, depth=depth - 1, connector=prefix + connector
-        )
-    yield "</details>"
-
-
-@dataclass(frozen=True)
-class CompressedTree:
-    root: Node
-
-    @classmethod
-    def from_json(cls, json: dict) -> "CompressedTree":
-        def from_json(json: dict) -> Node:
-            return Node(
-                key=json["key"],
-                values=values_from_json(json["values"]),
-                metadata=json["metadata"] if "metadata" in json else {},
-                payload=json["payload"] if "payload" in json else [],
-                children=[from_json(c) for c in json["children"]],
-            )
-
-        return CompressedTree(root=from_json(json))
-
-    def __str__(self):
-        return "".join(node_tree_to_string(node=self.root))
-
-    def html(self, depth=2) -> HTML:
-        return HTML(self._repr_html_(depth=depth))
-
-    def _repr_html_(self, depth=2):
-        css = """
-        <style>
-        .qubed-tree-view {
-            font-family: monospace;
-            white-space: pre;
-        }
-        .qubed-tree-view details {
-            # display: inline;
-            margin-left: 0;
-        }
-        .qubed-tree-view summary {
-            list-style: none;
-            cursor: pointer;
-            text-overflow: ellipsis;
-            overflow: hidden;
-            text-wrap: nowrap;
-            display: block;
-        }
-
-        .qubed-tree-view .leaf {
-            text-overflow: ellipsis;
-            overflow: hidden;
-            text-wrap: nowrap;
-            display: block;
-        }
-
-        .qubed-tree-view summary:hover,span.leaf:hover {
-            background-color: #f0f0f0;
-        }
-        .qubed-tree-view details > summary::after {
-            content: ' ';
-        }
-        .qubed-tree-view details:not([open]) > summary::after {
-            content: " ▼";
-        }
-        </style>
-
-        """
-        nodes = "".join(
-            cc
-            for c in self.root.children
-            for cc in node_tree_to_html(node=c, depth=depth)
-        )
-        return f"{css}<pre class='qubed-tree-view'>{nodes}</pre>"
-
-    def print(self, depth=None):
-        print(
-            "".join(
-                cc
-                for c in self.root.children
-                for cc in node_tree_to_string(node=c, depth=depth)
-            )
-        )
-
-    def transform(self, func: Callable[[Node], Node]) -> "CompressedTree":
-        "Call a function on every node of the tree, any changes to the children of a node will be ignored."
-
-        def transform(node: Node) -> Node:
-            new_node = func(node)
-            return dataclasses.replace(
-                new_node, children=[transform(c) for c in node.children]
-            )
-
-        return CompressedTree(root=transform(self.root))
-
-    def guess_datatypes(self) -> "CompressedTree":
-        def guess_datatypes(node: Node) -> list[Node]:
-            # Try to convert enum values into more structured types
-            children = [cc for c in node.children for cc in guess_datatypes(c)]
-
-            if isinstance(node.values, Enum):
-                match node.key:
-                    case "time":
-                        range_class = TimeRange
-                    case "date":
-                        range_class = DateRange
-                    case _:
-                        range_class = None
-
-                if range_class is not None:
-                    return [
-                        dataclasses.replace(node, values=range, children=children)
-                        for range in range_class.from_strings(node.values.values)
-                    ]
-            return [dataclasses.replace(node, children=children)]
-
-        children = [cc for c in self.root.children for cc in guess_datatypes(c)]
-        return CompressedTree(root=dataclasses.replace(self.root, children=children))
-
-    def select(
-        self,
-        selection: dict[str, str | list[str]],
-        mode: Literal["strict", "relaxed"] = "relaxed",
-    ) -> "CompressedTree":
-        # make all values lists
-        selection = {k: v if isinstance(v, list) else [v] for k, v in selection.items()}
-
-        def not_none(xs):
-            return [x for x in xs if x is not None]
-
-        def select(node: Node) -> Node | None:
-            # Check if the key is specified in the selection
-            if node.key not in selection:
-                if mode == "strict":
-                    return None
-                return dataclasses.replace(
-                    node, children=not_none(select(c) for c in node.children)
-                )
-
-            # If the key is specified, check if any of the values match
-            values = Enum([c for c in selection[node.key] if c in node.values])
-
-            if not values:
-                return None
-
-            return dataclasses.replace(
-                node, values=values, children=not_none(select(c) for c in node.children)
-            )
-
-        return CompressedTree(
-            root=dataclasses.replace(
-                self.root, children=not_none(select(c) for c in self.root.children)
-            )
-        )
-
-    def to_list_of_cubes(self):
-        def to_list_of_cubes(node: Node) -> list[list[Node]]:
-            return [
-                [node] + sub_cube
-                for c in node.children
-                for sub_cube in to_list_of_cubes(c)
-            ]
-
-        return to_list_of_cubes(self.root)
-
-    def info(self):
-        cubes = self.to_list_of_cubes()
-        print(f"Number of distinct paths: {len(cubes)}")
-
-
-# What should the interace look like?
-
-# tree = CompressedTree.from_json(...)
-# tree = CompressedTree.from_protobuf(...)
-
-# tree.print(depth = 5) # Prints a nice tree representation