updates

2025-02-10 15:26:25 +00:00 · 2025-02-10 15:26:25 +00:00 · 609e3e9f74
commit 609e3e9f74
parent 81a478a58f
9 changed files with 888 additions and 13 deletions
--- a/tree_compresser/Cargo.lock
+++ b/tree_compresser/Cargo.lock
@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4

 [[package]]
 name = "aho-corasick"
@ -359,7 +359,6 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
 [[package]]
 name = "rsfdb"
 version = "0.1.0"
-source = "git+https://github.com/ecmwf/rsfdb?branch=develop#ab8c9590bba15d22167c274db9238cd9b897baf1"
 dependencies = [
 "libc",
 "libloading",
@ -372,7 +371,6 @@ dependencies = [
 [[package]]
 name = "rsfindlibs"
 version = "0.1.1"
-source = "git+https://github.com/ecmwf-projects/rsfindlibs.git#1358b1049bf3e0b581badfc8005a9828a542cdaa"
 dependencies = [
 "cc",
 "clap",
--- a/tree_compresser/Cargo.toml
+++ b/tree_compresser/Cargo.toml
@ -1,7 +1,8 @@
 [package]
-name = "qubed_tree"
-version = "0.1.0"
+name = "qubed"
+version = "0.1.2"
 edition = "2021"
+repository = "https://github.com/ecmwf/qubed"

 [dependencies]
 rsfdb = {git = "https://github.com/ecmwf/rsfdb", branch = "develop"}
@ -16,7 +17,7 @@ crate-type = ["cdylib"]
 path = "./rust_src/lib.rs"

 [patch.'https://github.com/ecmwf/rsfdb']
-rsfdb = { path = "../rsfdb" }
+rsfdb = { path = "../../rsfdb" }

 [patch.'https://github.com/ecmwf-projects/rsfindlibs']
-rsfindlibs = { path = "../rsfindlibs" }
+rsfindlibs = { path = "../../rsfindlibs" }
--- a/tree_compresser/python_src/tree_traverser/CompressedDataCubeTree.py
+++ b/tree_compresser/python_src/tree_traverser/CompressedDataCubeTree.py
@ -0,0 +1,216 @@
+import dataclasses
+from collections import defaultdict
+from dataclasses import dataclass, field
+
+from frozendict import frozendict
+
+from .DataCubeTree import Enum, NodeData, Tree
+from .tree_formatters import HTML, node_tree_to_html, node_tree_to_string
+
+NodeId = int
+CacheType = dict[NodeId, "CompressedNode"]
+
+@dataclass(frozen=True)
+class CompressedNode:
+    """A node of a CompressedTree that refers to its children by id.
+
+    Child nodes are not stored directly; their ids are resolved through the
+    shared cache mapping each time `children` is read.
+    """
+    # Identifier of this node in the cache; excluded from hashing and comparison.
+    id: NodeId = field(hash=False, compare=False)
+    data: NodeData
+
+    # Ids of the child nodes, resolved through `_cache` on demand.
+    _children: tuple[NodeId, ...]
+    _cache: CacheType = field(repr=False, hash=False, compare=False)
+
+    @property
+    def children(self) -> tuple["CompressedNode", ...]:
+        """Resolve the stored child ids into nodes via the cache."""
+        lookup = self._cache
+        return tuple(lookup[child_id] for child_id in self._children)
+
+    def summary(self, debug = False) -> str:
+        """Return a short `key=values` label for this node.
+
+        Without `debug` the root node is rendered as plain `root`. With `debug`
+        the `key=values` form is always used and the node id is appended.
+        """
+        if not debug and self.data.key == "root":
+            return "root"
+
+        label = f"{self.data.key}={self.data.values.summary()}"
+        if debug:
+            return f"{label} ({self.id})"
+        return label
+
+
+@dataclass(frozen=True)
+class CompressedTree:
+    """
+    This tree is compressed in two distinct ways:
+    1. Product Compression: Nodes have a key and **multiple values**, so each node represents many logical nodes key=value1, key=value2, ...
+       Each of these logical nodes has identical children so we can compress them like this.
+       In this way any distinct path through the tree represents a cartesian product of the values, otherwise known as a datacube.
+
+    2. Node deduplication: In order to facilitate the product compression described above we need to know when two nodes have identical children.
+       To do this every node is assigned an id which is initially computed as a hash of the node's data and its children's ids.
+       In order to avoid hash collisions we increment the initial hash if it's already in the cache for a different node;
+       we do this until we find a unique id.
+
+       Crucially this allows us to later determine if a new node is already cached:
+        id = hash(node)
+        while True:
+            if id not in cache: The node is definitely not in the cache
+            elif cache[id] != node: Hash collision, increment id and try again
+            else: The node is already in the cache
+            id += 1
+
+    This tree can be walked from the root by repeatedly looking up the children of a node in the cache.
+
+    This structure facilitates compression because we can look at the children of a node:
+        If two children have the same key, metadata and children then we can compress them into a single node.
+
+    """
+    # Entry point of the tree; its children are resolved through `cache`.
+    root: CompressedNode
+    # Mapping from node id to node, shared by every node of the tree.
+    cache: CacheType
+
+    @staticmethod
+    def add_to_cache(cache : dict[NodeId, CompressedNode], data : NodeData, _children: tuple[NodeId, ...]) -> NodeId:
+        """
+        Store a node in the cache (unless an equal node is already there) and return its id.
+
+        The id starts out as the hash of the node's data and its sorted child ids.
+        If that slot is taken by a *different* node the id is incremented (modulo 2**64)
+        and probed again, so an equal node is always found under the same id and is
+        never stored twice.
+
+        Raises RuntimeError if no slot is found after 101 probes.
+        """
+        child_ids = tuple(sorted(_children))
+        candidate = hash((data, child_ids))
+
+        for _attempt in range(101):
+            if candidate not in cache:
+                # Free slot: the node is new, so register it here.
+                cache[candidate] = CompressedNode(id = candidate,
+                                                  data = data,
+                                                  _children = child_ids,
+                                                  _cache = cache)
+                return candidate
+
+            occupant = cache[candidate]
+            if occupant.data == data and occupant._children == child_ids:
+                # An equal node is already cached under this id.
+                return candidate
+
+            # Slot taken by a different node: probe the next id.
+            candidate = (candidate + 1) % (2**64)
+
+        raise RuntimeError("Too many hash collisions, something is wrong.")
+
+
+    @classmethod
+    def from_tree(cls, tree : Tree) -> 'CompressedTree':
+        """
+        Build a CompressedTree by caching every node of `tree`, children before parents.
+
+        Only the key and values of each node are carried over into the cached NodeData.
+        """
+        node_cache = {}
+
+        def register(subtree : Tree) -> NodeId:
+            payload = NodeData(
+                key = subtree.key,
+                values = subtree.values,
+            )
+
+            # Children are cached first so that their ids are known
+            child_ids = []
+            for child in subtree.children:
+                child_ids.append(register(child))
+
+            return cls.add_to_cache(node_cache, payload, tuple(child_ids))
+
+        root_id = register(tree)
+        return cls(cache = node_cache, root = node_cache[root_id])
+    
+    def __str__(self):
+        return "".join(node_tree_to_string(self.root))
+    
+    def html(self, depth = 2, debug = False) -> HTML:
+        return HTML(node_tree_to_html(self.root, depth = depth, debug = debug))
+    
+    def _repr_html_(self) -> str:
+        return node_tree_to_html(self.root, depth = 2)
+    
+    def __getitem__(self, args) -> 'CompressedTree':
+        """
+        Index with `tree[key, value]`: return a tree rooted at the first child of the
+        root whose key is `key` and whose values contain `value`, narrowed to that value.
+
+        Raises KeyError when no child matches.
+        """
+        key, value = args
+        match = next(
+            (c for c in self.root.children if c.data.key == key and value in c.data.values),
+            None,
+        )
+        if match is None:
+            raise KeyError(f"Key {key} not found in children.")
+
+        narrowed = dataclasses.replace(match.data, values = Enum((value,)))
+        new_root = dataclasses.replace(match, data = narrowed)
+        return CompressedTree(cache = self.cache, root = new_root)
+    
+    def collapse_children(self, node: "CompressedNode") -> "CompressedNode":
+        """
+        Recursively merge sibling nodes that share key, metadata and children.
+
+        Returns the cached node equivalent to `node` after its children (and all
+        their descendants) have been collapsed. New nodes are added to `self.cache`.
+        """
+        # First perform the collapse on the children
+        new_children = [self.collapse_children(child) for child in node.children]
+
+        # Now take the set of new children and see if any have identical key, metadata and children
+        # the values may differ and will be collapsed into a single node
+        identical_children = defaultdict(set)
+        for child in new_children:
+            identical_children[(child.data.key, child.data.metadata, child._children)].add(child)
+
+        # Now go through and create new compressed nodes for any groups that need collapsing
+        new_children = []
+        for (key, metadata, _children), child_set in identical_children.items():
+            if len(child_set) > 1:
+                # Compress the children into a single node
+                assert all(isinstance(child.data.values, Enum) for child in child_set), "All children must have Enum values"
+                # The merged node gets empty metadata; the order of the merged values
+                # follows the iteration order of the set.
+                node_data = NodeData(
+                    key = key,
+                    metadata = frozendict(), # Todo: Implement metadata compression
+                    values = Enum(tuple(v for child in child_set for v in child.data.values.values)),
+                )
+
+                # Add the node to the cache
+                id = type(self).add_to_cache(self.cache, node_data, _children)
+            else:
+                # If the group is size one just keep it
+                id = child_set.pop().id
+
+            new_children.append(id)
+
+        # Re-register this node with its collapsed children and return the cached instance
+        id = self.add_to_cache(self.cache, node.data, tuple(sorted(new_children)))
+        return self.cache[id]
+
+
+    def compress(self) -> 'CompressedTree':
+        return CompressedTree(cache = self.cache, root = self.collapse_children(self.root))
+
+    def lookup(self, selection : dict[str, str]):
+        """
+        Walk down from the root following the branch selected by `selection`.
+
+        At each step the child whose key is present in `selection` and whose values
+        contain the selected value is followed. The walk stops at the first node
+        that has no matching child.
+
+        `selection` is not modified; keys are consumed from an internal copy so that
+        a key can only be matched once along the path.
+
+        Returns the list of nodes visited, starting with the root.
+        Raises RuntimeError if more than one child matches at any level (overlapping
+        branches) or if the walk exceeds 1000 steps.
+        """
+        remaining = dict(selection)  # copy: never consume the caller's dict
+        nodes = [self.root]
+        for _ in range(1000):
+            current_node = nodes[-1]
+
+            # Collect every match before consuming the key, otherwise two siblings
+            # with the same key could never be detected as overlapping.
+            matches = [c for c in current_node.children
+                       if remaining.get(c.data.key, None) in c.data.values]
+
+            if not matches:
+                return nodes
+
+            if len(matches) > 1:
+                raise RuntimeError("This tree is invalid, because it contains overlapping branches.")
+
+            nodes.append(matches[0])
+            remaining.pop(matches[0].data.key)
+
+        raise RuntimeError("Maximum node searches exceeded, the tree contains a loop or something is buggy.")
+
+
+
+    
+    # def reconstruct(self) -> Tree:
+    #     def reconstruct_node(h : int) -> Tree:
+    #         node = self.cache[h]
+    #         dedup : dict[tuple[int, str], set[NodeId]] = defaultdict(set)
+    #         for index in self.cache[h].children:
+    #             child_node = self.cache[index]
+    #             child_hash = hash(child_node.children)
+    #             assert isinstance(child_node.values, Enum)
+    #             dedup[(child_hash, child_node.key)].add(index)
+
+        
+    #         children = tuple(
+    #             Tree(key = key, values = Enum(tuple(values)), 
+    #             children = tuple(reconstruct_node(i) for i in self.cache[next(indices)].children)
+    #             )
+    #             for (_, key), indices in dedup.items()
+    #         )
+
+    #         return Tree(
+    #             key = node.key,
+    #             values = node.values,
+    #             children = children,
+    #         )
+    #     return reconstruct_node(self.root)
--- a/tree_compresser/python_src/tree_traverser/CompressedTree.py
+++ b/tree_compresser/python_src/tree_traverser/CompressedTree.py
@ -1,5 +1,6 @@
 import json
 from collections import defaultdict
+from dataclasses import asdict, dataclass
 from pathlib import Path

 Tree = dict[str, "Tree"]
@ -13,6 +14,11 @@ class RefcountedDict(dict[str, int]):
    def __hash__(self):
        return hash(tuple(sorted(self.items())))

+@dataclass
+class JSONNode:
+    """Plain node record built by `to_json` and converted to a dict with `asdict`."""
+    key: str                      # the part of a cached "key=value" entry before the "="
+    values: list[str]             # the values that share this key and the same child node
+    children: list["JSONNode"]    # nodes reconstructed from the child node's cache entry

 class CompressedTree():
    """
@ -101,6 +107,23 @@ class CompressedTree():
            return {f"{key}={','.join(values)}" : reconstruct_node(h, depth=depth+1) for (h, key), values in dedup.items()}
        return reconstruct_node(from_node or self.root_hash, depth=0)
    
+    def to_json(self, max_depth=None, from_node=None) -> dict:
+        """
+        Convert the tree into nested dicts of the form
+        {"key": ..., "values": [...], "children": [...]}.
+
+        max_depth: if given, nodes deeper than this get an empty `children` list.
+        from_node: hash of the node to start from; defaults to the root hash.
+
+        Only the first entry found under the starting node is returned.
+        Raises IndexError if the starting node has no entries.
+        """
+        def reconstruct_node(h : int, depth : int) -> list[JSONNode]:
+            if max_depth is not None and depth > max_depth:
+                # Keep the declared type: `children` is always a list
+                return []
+            dedup : dict[tuple[int, str], set[str]] = defaultdict(set)
+            for k, h2 in self.cache[h].items():
+                # Split on the first "=" only so values may themselves contain "="
+                key, value = k.split("=", 1)
+                dedup[(h2, key)].add(value)
+
+            return [JSONNode(
+                key = key,
+                values = list(values),
+                children = reconstruct_node(child_hash, depth=depth+1),
+            ) for (child_hash, key), values in dedup.items()]
+
+        # `from_node or ...` would wrongly discard a node whose hash happens to be 0
+        start = self.root_hash if from_node is None else from_node
+        return asdict(reconstruct_node(start, depth=0)[0])
+    
    def __init__(self, tree : Tree):
        self.cache = {}
        self.empty_hash = hash(RefcountedDict({}))
@ -139,8 +162,8 @@ class CompressedTree():
        return list(loc.keys())
    
    def multi_match(self, request : dict[str, list[str]], loc = None):
-        if not loc: return {"_END_" : {}}
        if loc is None: loc = self.tree
+        if loc == {}: return {"_END_" : {}}
        matches = {}
        for request_key, request_values in request.items():
            for request_value in request_values:
--- a/tree_compresser/python_src/tree_traverser/DataCubeTree.py
+++ b/tree_compresser/python_src/tree_traverser/DataCubeTree.py
@ -0,0 +1,267 @@
+import dataclasses
+from dataclasses import dataclass, field
+from typing import Any, Callable, Hashable, Literal, Mapping
+
+from frozendict import frozendict
+
+from .tree_formatters import HTML, node_tree_to_html, node_tree_to_string
+from .value_types import DateRange, Enum, IntRange, TimeRange, Values
+
+
+def values_from_json(obj) -> Values:
+    """
+    Build a Values object from its JSON form.
+
+    A list becomes an Enum of its items. Anything else is indexed by "dtype" and
+    passed as keyword arguments to the matching range class.
+    Raises ValueError for a dtype other than "date", "time" or "int".
+    """
+    if isinstance(obj, list):
+        return Enum(tuple(obj))
+
+    dtype = obj["dtype"]
+    if dtype == "date":
+        return DateRange(**obj)
+    if dtype == "time":
+        return TimeRange(**obj)
+    if dtype == "int":
+        return IntRange(**obj)
+    raise ValueError(f"Unknown dtype {dtype}")
+
+# In practice use a frozendict
+Metadata = Mapping[str, str | int | float | bool]
+
+@dataclass(frozen=True, eq=True, order=True)
+class NodeData:
+    """Immutable payload of a tree node: a key, its values and optional metadata."""
+    key: str
+    values: Values
+    # Excluded from comparisons (compare=False); defaults to an empty frozendict.
+    # NOTE(review): annotated as a dict but the default is a frozendict - confirm the intended type.
+    metadata: dict[str, tuple[Hashable, ...]] = field(default_factory=frozendict, compare=False)
+
+    def summary(self) -> str:
+        """Return "key=<summary of values>", or just "root" when the key is "root"."""
+        return f"{self.key}={self.values.summary()}" if self.key != "root" else "root"
+
+@dataclass(frozen=True, eq=True, order=True)
+class Tree:
+    data: NodeData
+    children: tuple['Tree', ...]
+
+    @property
+    def key(self) -> str:
+        return self.data.key
+    
+    @property
+    def values(self) -> Values:
+        return self.data.values
+    
+    @property
+    def metadata(self) -> frozendict[str, Any]:
+        return self.data.metadata
+
+    
+    def summary(self) -> str:
+        return self.data.summary()
+    
+    @classmethod
+    def make(cls, key : str, values : Values, children, **kwargs) -> 'Tree':
+        """
+        Construct a Tree node from its parts.
+
+        children: iterable of Tree nodes; stored as a sorted tuple.
+        metadata (keyword, optional): mapping stored on the node. It is copied into a
+            frozendict so that a plain dict passed by the caller cannot leave a
+            mutable mapping inside the frozen node.
+        """
+        metadata = frozendict(kwargs.get("metadata") or {})
+        return cls(
+            data = NodeData(key, values, metadata = metadata),
+            children = tuple(sorted(children)),
+        )
+
+
+    @classmethod
+    def from_json(cls, json: dict) -> 'Tree':
+        """
+        Build a Tree from nested dicts with "key", "values", "children" and an
+        optional "metadata" entry (an empty dict is used when it is absent).
+        """
+        def build(spec: dict) -> Tree:
+            return Tree.make(
+                key=spec["key"],
+                values=values_from_json(spec["values"]),
+                metadata=spec["metadata"] if "metadata" in spec else {},
+                children=tuple(build(child_spec) for child_spec in spec["children"])
+            )
+
+        return build(json)
+    
+    @classmethod
+    def from_dict(cls, d: dict) -> 'Tree':
+        """
+        Build a Tree from nested dicts whose keys look like "key=value1/value2".
+
+        The returned node is a synthetic "root" whose children come from `d`.
+        """
+        def build_children(mapping: dict) -> tuple[Tree, ...]:
+            nodes = []
+            for label, sub_mapping in mapping.items():
+                parts = label.split("=")
+                nodes.append(Tree.make(
+                    key=parts[0],
+                    values=Enum(tuple(parts[1].split("/"))),
+                    children=build_children(sub_mapping)
+                ))
+            return tuple(nodes)
+
+        return Tree.make(key = "root",
+                         values=Enum(("root",)),
+                         children = build_children(d))
+    
+    @classmethod
+    def empty(cls) -> 'Tree':
+        return cls.make("root", Enum(("root",)), [])
+
+    
+    def __str__(self):
+        return "".join(node_tree_to_string(node=self))
+    
+    def html(self, depth = 2, collapse = True) -> HTML:
+        return HTML(node_tree_to_html(self, depth = depth, collapse = collapse))
+    
+    def _repr_html_(self) -> str:
+        return node_tree_to_html(self, depth = 2, collapse = True)
+    
+    def __getitem__(self, args) -> 'Tree':
+        """
+        Index with `tree[key, value]`: return the first child whose key is `key` and
+        whose values contain `value`, with its values narrowed to just `value`.
+
+        Raises KeyError when no child matches.
+        """
+        key, value = args
+        for child in self.children:
+            if not (child.key == key and value in child.values):
+                continue
+            narrowed = dataclasses.replace(child.data, values = Enum((value,)))
+            return dataclasses.replace(child, data = narrowed)
+
+        raise KeyError(f"Key {key} not found in children of {self.key}")
+
+    
+    def print(self, depth = None):
+        print("".join(cc for c in self.children for cc in node_tree_to_string(node=c, depth = depth)))
+
+    def transform(self, func: 'Callable[[Tree], Tree | list[Tree]]') -> 'Tree':
+        """
+        Call a function on every node of the tree (except this one), return one or more nodes.
+        If multiple nodes are returned they each get a copy of the (transformed) children of the original node.
+        Any changes to the children of a node will be ignored.
+
+        Returns a copy of this node whose children are the transformed nodes.
+        """
+        def transform(node: 'Tree') -> 'list[Tree]':
+            # Children must stay a tuple: a list would make the frozen Tree
+            # unhashable and break comparisons with other nodes.
+            children = tuple(cc for c in node.children for cc in transform(c))
+            new_nodes = func(node)
+            if isinstance(new_nodes, Tree):
+                new_nodes = [new_nodes]
+
+            return [dataclasses.replace(new_node, children = children)
+                    for new_node in new_nodes]
+
+        children = tuple(cc for c in self.children for cc in transform(c))
+        return dataclasses.replace(self, children = children)
+
+    def guess_datatypes(self) -> 'Tree':
+        """
+        Return a copy of the tree in which Enum values under the keys "time" and
+        "date" are replaced by TimeRange / DateRange values.
+
+        A node is replaced by one node per range returned by `from_strings`;
+        each of them receives the same converted children.
+        """
+        range_types = {"time": TimeRange, "date": DateRange}
+
+        def convert(node: Tree) -> list[Tree]:
+            converted_children = tuple(new for child in node.children for new in convert(child))
+
+            range_type = range_types.get(node.key) if isinstance(node.values, Enum) else None
+            if range_type is None:
+                return [dataclasses.replace(node, children = converted_children)]
+
+            return [
+                dataclasses.replace(node, values = value_range, children = converted_children)
+                for value_range in range_type.from_strings(node.values.values)
+            ]
+
+        top_level = tuple(new for child in self.children for new in convert(child))
+        return dataclasses.replace(self, children = top_level)
+
+    
+    def select(self, selection : dict[str, str | list[str]], mode: Literal["strict", "relaxed"] = "relaxed") -> 'Tree':
+        """
+        Return a copy of the tree restricted to `selection`.
+
+        For a node whose key is in `selection`, only the selected values that the
+        node contains are kept; the node is dropped when none remain.
+        For a node whose key is not in `selection`, the node is dropped in
+        "strict" mode and kept unchanged (children still filtered) otherwise.
+        """
+        # Normalise every selected value to a list
+        wanted = {}
+        for name, chosen in selection.items():
+            wanted[name] = chosen if isinstance(chosen, list) else [chosen]
+
+        def keep_matching(nodes) -> tuple:
+            kept = []
+            for candidate in nodes:
+                pruned = prune(candidate)
+                if pruned is not None:
+                    kept.append(pruned)
+            return tuple(kept)
+
+        def prune(node: Tree) -> Tree | None:
+            if node.key in wanted:
+                matched = Enum(tuple(v for v in wanted[node.key] if v in node.values))
+                if not matched:
+                    return None
+                return dataclasses.replace(node, values = matched, children = keep_matching(node.children))
+
+            # The key was not requested at all
+            if mode == "strict":
+                return None
+            return dataclasses.replace(node, children = keep_matching(node.children))
+
+        return dataclasses.replace(self, children = keep_matching(self.children))
+    
+
+    @staticmethod
+    def _insert(position: "Tree", identifier : list[tuple[str, list[str]]]):
+        """
+        Return a copy of `position` with `identifier` inserted below it.
+        `identifier` is not modified.
+
+        This algorithm goes as follows:
+        We're at a particular node in the tree, and we have a list of key-values pairs that we want to insert.
+        We take the first key-values pair (key, values) and keep the rest for the recursion.
+
+        The general idea is to insert key, values into the current node and use recursion to handle the rest of the identifier.
+
+        We have two sources of values with possible overlap. The values to insert and the values attached to the children of this node.
+        For each value coming from either source we put it in one of three categories:
+            1) Values that exist only in the already existing child. (Coming exclusively from position.children)
+            2) Values that exist in both a child and the new values.
+            3) Values that exist only in the new values.
+
+        Thus we loop over the children that carry this key and partition their values into these categories.
+
+        For 1) we create a new child node with the key, reduced set of values and the same children.
+        For 2)
+            Create a new child node with the key, the values in group 2 and the existing children
+            Recurse to merge the rest of the identifier into those children
+
+        Once we have finished looping over children we know all the values left over came exclusively from the new values.
+        So we:
+            Create a new node with these values.
+            Recurse to compute the children
+
+        Children of `position` with a different key are kept unchanged.
+        Finally we return the node with all these new children.
+        """
+        if not identifier:
+            return position
+
+        # Slice instead of pop(0): the remaining pairs are needed by *every* branch
+        # created below, so the list must not be consumed by the first recursion.
+        (key, values), remaining = identifier[0], identifier[1:]
+
+        # Determine which children have this key
+        possible_children = {c : [] for c in position.children if c.key == key}
+        entirely_new_values = []
+
+        # For each value check whether it is already in one of the children
+        for v in values:
+            for c in possible_children:
+                if v in c.values:
+                    possible_children[c].append(v)
+                    break
+            else: # only executed if the loop did not break
+                # If none of the children have this value, add it to the new child pile
+                entirely_new_values.append(v)
+
+        # Children with another key are not touched by this insertion: keep them
+        new_children = [c for c in position.children if c.key != key]
+        for c, affected in possible_children.items():
+            if not affected:
+                new_children.append(c)
+                continue
+
+            unaffected = [x for x in c.values if x not in affected]
+            if unaffected:
+                unaffected_node = Tree.make(c.key, Enum(tuple(unaffected)), c.children)
+                new_children.append(unaffected_node) # Add the unaffected part of this child
+
+            # Start from the existing children so the subtree already stored under
+            # these values is merged with the new data instead of being discarded.
+            new_node = Tree.make(key, Enum(tuple(affected)), c.children)
+            new_children.append(Tree._insert(new_node, remaining)) # Add the affected part of this child
+
+        # If there are any values not in any of the existing children, add them as a new child
+        if entirely_new_values:
+            new_node = Tree.make(key, Enum(tuple(entirely_new_values)), [])
+            new_children.append(Tree._insert(new_node, remaining))
+
+        return Tree.make(position.key, position.values, new_children)
+
+    def insert(self, identifier : dict[str, list[str]]) -> 'Tree':
+        """Return a copy of the tree with the key -> values pairs of `identifier` inserted, in dict order."""
+        pairs = list(identifier.items())
+        return Tree._insert(self, pairs)
+    
+    def to_list_of_cubes(self):
+        """
+        Return every root-to-leaf path of the tree as a list of nodes.
+
+        Each path starts with this node and ends with a node that has no children.
+        """
+        def to_list_of_cubes(node: 'Tree') -> 'list[list[Tree]]':
+            # Base case: without it every path collapses to nothing and the
+            # result is always an empty list.
+            if not node.children:
+                return [[node]]
+            return [[node] + sub_cube for c in node.children for sub_cube in to_list_of_cubes(c)]
+
+        return to_list_of_cubes(self)
+
+    def info(self):
+        """Print the number of paths returned by `to_list_of_cubes`."""
+        path_count = len(self.to_list_of_cubes())
+        print(f"Number of distinct paths: {path_count}")
--- a/tree_compresser/python_src/tree_traverser/tree_formatters.py
+++ b/tree_compresser/python_src/tree_traverser/tree_formatters.py
@ -0,0 +1,116 @@
+from dataclasses import dataclass
+from typing import Iterable, Protocol, Sequence, runtime_checkable
+
+
+@runtime_checkable
+class TreeLike(Protocol):
+    """Structural type accepted by the formatters: anything exposing `children` and `summary()`."""
+    @property
+    def children(self) -> Sequence["TreeLike"]: ...  # Supports indexing like node.children[i]
+
+    # Keyword arguments given to the formatters are forwarded to this method.
+    def summary(self, **kwargs) -> str: ...
+
+@dataclass(frozen=True)
+class HTML:
+    """Wrapper around an HTML string that exposes it through `_repr_html_`."""
+    html: str
+
+    def _repr_html_(self):
+        """Return the wrapped HTML string unchanged."""
+        markup = self.html
+        return markup
+
+def summarize_node(node: TreeLike, collapse = False, **kwargs) -> tuple[str, TreeLike]:
+    """
+    Build the one-line label for a node.
+
+    Each summary longer than 50 characters is cut to 50 characters plus "...".
+    With `collapse` set, chains of single-child nodes are followed and their
+    summaries joined with ", ".
+    Returns the label and the last node that was summarised.
+    """
+    def clipped(current: TreeLike) -> str:
+        text = current.summary(**kwargs)
+        return text[:50] + "..." if len(text) > 50 else text
+
+    parts = [clipped(node)]
+
+    # Keep descending only while collapsing and there is exactly one child
+    while collapse and len(node.children) == 1:
+        node = node.children[0]
+        parts.append(clipped(node))
+
+    return ", ".join(parts), node
+
+def node_tree_to_string(node : TreeLike, prefix : str = "", depth = None) -> Iterable[str]:
+    """
+    Yield the pieces of a text rendering of the tree rooted at `node`.
+
+    prefix: text put in front of every connector below this node.
+    depth: number of levels still to print; at 0 or below the node is printed
+        followed by " - ...". None means no limit.
+    """
+    summary, node = summarize_node(node)
+
+    if depth is not None and depth <= 0:
+        yield summary + " - ...\n"
+        return
+
+    kids = node.children
+
+    # A single child is printed on the same line to keep the output compact
+    if len(kids) == 1:
+        yield summary + ", "
+        yield from node_tree_to_string(kids[0], prefix, depth = depth)
+        return
+
+    yield summary + "\n"
+
+    next_depth = None if depth is None else depth - 1
+    last_index = len(kids) - 1
+    for index, child in enumerate(kids):
+        is_last = index == last_index
+        yield prefix + ("└── " if is_last else "├── ")
+        yield from node_tree_to_string(child, prefix + ("    " if is_last else "│   "), depth = next_depth)
+
+def _node_tree_to_html(node : TreeLike, prefix : str = "", depth = 1, connector = "", **kwargs) -> Iterable[str]:
+    """
+    Yield the HTML fragments for the tree rooted at `node`.
+
+    Leaves become <span class="leaf"> elements, other nodes become <details>
+    elements that are rendered open while `depth` is positive.
+    prefix / connector: the box-drawing text placed in front of the label.
+    Extra keyword arguments are forwarded to `summarize_node`.
+    """
+    from html import escape
+
+    summary, node = summarize_node(node, **kwargs)
+
+    # Labels are plain text: escape them so keys or values containing "<" or "&"
+    # cannot break (or inject into) the generated markup.
+    label = escape(f"{connector}{summary}", quote=False)
+
+    if len(node.children) == 0:
+        yield f'<span class="leaf">{label}</span>'
+        return
+    else:
+        open_attr = "open" if depth > 0 else ""
+        yield f"<details {open_attr}><summary>{label}</summary>"
+
+    for index, child in enumerate(node.children):
+        connector = "└── " if index == len(node.children) - 1 else "├── "
+        extension = "    " if index == len(node.children) - 1 else "│   "
+        yield from _node_tree_to_html(child, prefix + extension, depth = depth - 1, connector = prefix+connector, **kwargs)
+    yield "</details>"
+
+def node_tree_to_html(node : TreeLike, depth = 1, **kwargs) -> str:
+        """
+        Return a self-contained HTML snippet (a <style> block followed by a <pre>)
+        that renders the tree rooted at `node`.
+
+        depth: passed on to `_node_tree_to_html`, where it controls how many levels
+            are rendered expanded.
+        Extra keyword arguments are forwarded to `_node_tree_to_html`.
+        """
+        # NOTE(review): "# display: inline;" in the stylesheet below is not a CSS
+        # comment ("#" has no such meaning in CSS) - confirm whether /* ... */ was intended.
+        css = """
+        <style>
+        .qubed-tree-view {
+            font-family: monospace;
+            white-space: pre;
+        }
+        .qubed-tree-view details {
+            # display: inline;
+            margin-left: 0;
+        }
+        .qubed-tree-view summary {
+            list-style: none;
+            cursor: pointer;
+            text-overflow: ellipsis;
+            overflow: hidden;
+            text-wrap: nowrap;
+            display: block;
+        }
+
+        .qubed-tree-view .leaf {
+            text-overflow: ellipsis;
+            overflow: hidden;
+            text-wrap: nowrap;
+            display: block;
+        }
+
+        .qubed-tree-view summary:hover,span.leaf:hover {
+            background-color: #f0f0f0;
+        }
+        .qubed-tree-view details > summary::after {
+            content: ' ▲';
+        }
+        .qubed-tree-view details:not([open]) > summary::after {
+            content: " ▼";
+        }
+        </style>
+
+        """
+        # Concatenate the fragments produced for every node of the tree
+        nodes = "".join(_node_tree_to_html(node=node, depth=depth, **kwargs))
+        return f"{css}<pre class='qubed-tree-view'>{nodes}</pre>"
--- a/tree_compresser/python_src/tree_traverser/trie.py
+++ b/tree_compresser/python_src/tree_traverser/trie.py
@ -0,0 +1,40 @@
+from dataclasses import dataclass, field
+
+character = str
+
+@dataclass(unsafe_hash=True)
+class TrieNode():
+    """A node of the trie: a link to its parent, the character leading to it and its children."""
+    parent: "TrieNode | None"
+    parent_char: character
+    # Excluded from the generated __hash__: a dict is unhashable, so hashing it
+    # made hash(node) raise TypeError for every node.
+    children: dict[character, "TrieNode"] = field(default_factory=dict, hash=False)
+
+
+@dataclass
+class Trie:
+    """
+    Character trie that hands out an integer id for every inserted word and can
+    turn such an id back into the word.
+    """
+    root: TrieNode = field(default_factory=lambda: TrieNode(None, ""))
+    # Maps id(node) -> node for the final node of every inserted word.
+    reverse_lookup: dict[int, TrieNode] = field(default_factory=dict)
+
+    def insert(self, word: str):
+        """Insert `word`, creating nodes as needed, and return the id of its final node."""
+        current = self.root
+        for letter in word:
+            following = current.children.get(letter)
+            if following is None:
+                following = TrieNode(current, letter)
+                current.children[letter] = following
+            current = following
+
+        key = id(current)
+        self.reverse_lookup.setdefault(key, current)
+        return key
+
+    def lookup_by_id(self, n_id: int):
+        """Return the word whose final node was registered under `n_id` (KeyError if unknown)."""
+        cursor = self.reverse_lookup[n_id]
+        letters = []
+        # Walk up to the root collecting characters, then put them in reading order
+        while cursor.parent is not None:
+            letters.append(cursor.parent_char)
+            cursor = cursor.parent
+
+        letters.reverse()
+        return "".join(letters)
+
--- a/tree_compresser/python_src/tree_traverser/value_types.py
+++ b/tree_compresser/python_src/tree_traverser/value_types.py
@ -0,0 +1,214 @@
+import dataclasses
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from datetime import date, datetime, timedelta
+from typing import Any, Iterable, Literal
+
+
+@dataclass(frozen=True)
+class Values(ABC):
+    """Abstract interface implemented by the value containers in this module.
+
+    Concrete subclasses must provide a textual summary, a length, a membership
+    test and a way to be built from strings.
+    """
+
+    @abstractmethod
+    def summary(self) -> str:
+        """Return a string summarising the contained values."""
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Return the number of contained values."""
+
+    @abstractmethod
+    def __contains__(self, value: Any) -> bool:
+        """Return whether ``value`` is one of the contained values."""
+
+    @abstractmethod
+    def from_strings(self, values: Iterable[str]) -> list['Values']:
+        """Build a list of ``Values`` objects from an iterable of strings."""
+
+@dataclass(frozen=True, order=True)
+class Enum(Values):
+    """
+    The simplest kind of key value is just a list of strings.
+    summary -> string1/string2/string....
+    """
+    # The members, stored as a tuple (checked in __post_init__).
+    values: tuple[Any, ...]
+
+    def __post_init__(self):
+        assert isinstance(self.values, tuple), "Enum.values must be a tuple"
+
+    def __iter__(self):
+        """Iterate over the members in stored order."""
+        return iter(self.values)
+
+    def __len__(self) -> int:
+        """Return the number of members."""
+        return len(self.values)
+
+    def summary(self) -> str:
+        """Return the members sorted, converted to ``str`` and joined with ``/``."""
+        return '/'.join(map(str, sorted(self.values)))
+
+    def __contains__(self, value: Any) -> bool:
+        """Return whether ``value`` is one of the members."""
+        return value in self.values
+
+    @classmethod
+    def from_strings(cls, values: Iterable[str]) -> list['Values']:
+        """Wrap ``values`` in a single instance and return it in a list.
+
+        This is a classmethod, like ``from_strings`` on the range classes in
+        this module, so it can be used as a constructor without an existing
+        instance. Calling it on an instance still works.
+        """
+        return [cls(tuple(values))]
+
+@dataclass(frozen=True)
+class Range(Values, ABC):
+    """Abstract base for range-style values; adds a keyword-only ``dtype`` tag."""
+
+    # Has no default here, so it must be supplied by keyword unless a subclass
+    # redeclares the field with a default.
+    dtype: str = dataclasses.field(kw_only=True)
+
+@dataclass(frozen=True)
+class DateRange(Range):
+    """An inclusive range of calendar dates from ``start`` to ``end``.
+
+    ``step`` is the spacing between consecutive members. A ``step`` of
+    ``timedelta(0)`` denotes a single date, in which case ``start == end``.
+    """
+    start: date
+    end: date
+    step: timedelta
+    dtype: Literal["date"] = dataclasses.field(kw_only=True, default="date")
+
+    @classmethod
+    def from_strings(cls, values: Iterable[str]) -> list['DateRange']:
+        """Group ``YYYYMMDD`` strings into ranges of consecutive days.
+
+        Duplicates are ignored and the input may be in any order. Each run of
+        consecutive days becomes one range with a one-day step; an isolated
+        day becomes a range with ``start == end`` and a zero step. Empty input
+        yields an empty list.
+
+        Raises ``ValueError`` if a string does not match ``%Y%m%d``.
+        """
+        one_day = timedelta(days=1)
+        # strptime yields datetimes; keep only the date part so the fields hold
+        # real ``date`` objects that __contains__ can compare against.
+        days = sorted({datetime.strptime(v, "%Y%m%d").date() for v in values})
+
+        # Each run is a mutable [first, last] pair; extend the latest run while
+        # the next day is adjacent, otherwise open a new one. Every day is
+        # consumed and every run is emitted, including the final one.
+        runs: list[list[date]] = []
+        for day in days:
+            if runs and day - runs[-1][1] == one_day:
+                runs[-1][1] = day
+            else:
+                runs.append([day, day])
+
+        return [
+            cls(
+                start=first,
+                end=last,
+                step=one_day if last > first else timedelta(days=0),
+            )
+            for first, last in runs
+        ]
+
+    def __contains__(self, value: Any) -> bool:
+        """Return whether the ``YYYYMMDD`` string ``value`` lies on the range.
+
+        Raises ``ValueError`` if ``value`` does not match ``%Y%m%d``.
+        """
+        v = datetime.strptime(value, "%Y%m%d").date()
+        if not (self.start <= v <= self.end):
+            return False
+        if self.step == timedelta(0):
+            # Single-date range: avoid a modulo by a zero timedelta.
+            return v == self.start
+        # timedelta % timedelta is a timedelta, so compare with timedelta(0),
+        # not with the integer 0 (which never compares equal).
+        return (v - self.start) % self.step == timedelta(0)
+
+    def __len__(self) -> int:
+        """Return the number of dates in the range, both ends included."""
+        if self.step == timedelta(0):
+            return 1
+        return (self.end - self.start) // self.step + 1
+
+    def summary(self) -> str:
+        """Format as ``start``, ``start/to/end`` or ``start/to/end/by/days``."""
+        def fmt(d): return d.strftime("%Y%m%d")
+        if self.step == timedelta(days=0):
+            return f"{fmt(self.start)}"
+        if self.step == timedelta(days=1):
+            return f"{fmt(self.start)}/to/{fmt(self.end)}"
+
+        return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step // timedelta(days=1)}"
+
+@dataclass(frozen=True)
+class TimeRange(Range):
+    """An inclusive range of integer-coded times from ``start`` to ``end``.
+
+    ``step`` is the spacing between consecutive members; a ``step`` of ``0``
+    denotes a single value. ``summary`` prints values zero-padded to four
+    digits.
+    """
+    start: int
+    end: int
+    step: int
+    dtype: Literal["time"] = dataclasses.field(kw_only=True, default="time")
+
+    @classmethod
+    def from_strings(cls, values: Iterable[str]) -> list['TimeRange']:
+        """Group integer strings into ranges of consecutive integers.
+
+        Duplicates are ignored and the input may be in any order; any iterable
+        is accepted, including generators. Values that differ by exactly 1 are
+        merged into one range with ``step=1``; an isolated value becomes a
+        range with ``start == end`` and ``step=0``. Empty input yields an
+        empty list.
+
+        Raises ``ValueError`` if a string is not a valid integer.
+        """
+        times = sorted({int(v) for v in values})
+        if not times:
+            return []
+
+        if len(times) == 1:
+            # NOTE(review): a lone input value has always been given step=100
+            # here, unlike isolated values among several (step=0). Kept as-is
+            # so existing output does not change; confirm which is intended.
+            return [cls(start=times[0], end=times[0], step=100)]
+
+        # Each run is a mutable [first, last] pair; extend the latest run while
+        # the next value is adjacent, otherwise open a new one. Every value is
+        # consumed and every run is emitted, including the final one.
+        runs: list[list[int]] = []
+        for t in times:
+            if runs and t - runs[-1][1] == 1:
+                runs[-1][1] = t
+            else:
+                runs.append([t, t])
+
+        return [
+            cls(start=first, end=last, step=1 if last > first else 0)
+            for first, last in runs
+        ]
+
+    def __len__(self) -> int:
+        """Return the number of values in the range, both ends included."""
+        if self.step == 0:
+            return 1
+        return (self.end - self.start) // self.step + 1
+
+    def summary(self) -> str:
+        """Format as ``start`` for a zero step, else ``start/to/end/by/step``."""
+        def fmt(d): return f"{d:04d}"
+        if self.step == 0:
+            return f"{fmt(self.start)}"
+        return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"
+
+    def __contains__(self, value: Any) -> bool:
+        """Return whether ``int(value)`` lies on the range."""
+        v = int(value)
+        if not (self.start <= v <= self.end):
+            return False
+        if self.step == 0:
+            # Single-value range: avoid a modulo by zero.
+            return v == self.start
+        return (v - self.start) % self.step == 0
+
+@dataclass(frozen=True)
+class IntRange(Range):
+    """An inclusive range of integers from ``start`` to ``end``.
+
+    ``step`` is the spacing between consecutive members; a ``step`` of ``0``
+    denotes a single value.
+    """
+    start: int
+    end: int
+    step: int
+    dtype: Literal["int"] = dataclasses.field(kw_only=True, default="int")
+
+    def __len__(self) -> int:
+        """Return the number of values in the range, both ends included."""
+        if self.step == 0:
+            return 1
+        return (self.end - self.start) // self.step + 1
+
+    def summary(self) -> str:
+        """Format as ``start`` for a zero step, else ``start/to/end/by/step``.
+
+        The bounds are integers, so they are formatted directly (they have no
+        ``strftime`` method).
+        """
+        if self.step == 0:
+            return f"{self.start}"
+        return f"{self.start}/to/{self.end}/by/{self.step}"
+
+    def __contains__(self, value: Any) -> bool:
+        """Return whether ``int(value)`` lies on the range."""
+        v = int(value)
+        if not (self.start <= v <= self.end):
+            return False
+        if self.step == 0:
+            # Single-value range: avoid a modulo by zero.
+            return v == self.start
+        return (v - self.start) % self.step == 0
+
+    @classmethod
+    def from_strings(cls, values: Iterable[str]) -> list['IntRange']:
+        """Group integer strings into ranges of consecutive integers.
+
+        Duplicates are ignored and the input may be in any order; any iterable
+        is accepted, including generators. Values that differ by exactly 1 are
+        merged into one range with ``step=1``; an isolated value becomes a
+        range with ``start == end`` and ``step=0``. Empty input yields an
+        empty list.
+
+        Raises ``ValueError`` if a string is not a valid integer.
+        """
+        ints = sorted({int(v) for v in values})
+
+        # Each run is a mutable [first, last] pair; extend the latest run while
+        # the next value is adjacent, otherwise open a new one. Every value is
+        # consumed and every run is emitted, including the final one.
+        runs: list[list[int]] = []
+        for i in ints:
+            if runs and i - runs[-1][1] == 1:
+                runs[-1][1] = i
+            else:
+                runs.append([i, i])
+
+        return [
+            cls(start=first, end=last, step=1 if last > first else 0)
+            for first, last in runs
+        ]
--- a/tree_compresser/tests/open_climate_dt.py
+++ b/tree_compresser/tests/open_climate_dt.py
@ -3,15 +3,15 @@ from pathlib import Path

 from tree_traverser import CompressedTree

-data_path = Path("/home/eouser/qubed/config/climate-dt/compressed_tree.json")
+data_path = Path("./config/climate-dt/compressed_tree.json")
 # Print size of file
 print(f"climate dt compressed tree: {data_path.stat().st_size // 1e6:.1f} MB")

 print("Opening json file")
 compressed_tree = CompressedTree.load(data_path)

-print(compressed_tree.reconstruct_compressed_ecmwf_style())
+print(compressed_tree.to_json())

-# print("Outputting compressed tree ecmwf style")
-# with open("data/compressed_tree_climate_dt_ecmwf_style.json", "w") as f:
-#     json.dump(compressed_tree.reconstruct_compressed_ecmwf_style(), f)
+print("Outputting compressed tree ecmwf style")
+with open("config/climate-dt/new_format.json", "w") as f:
+    json.dump(compressed_tree.to_json(), f)