updates
This commit is contained in:
parent
81a478a58f
commit
609e3e9f74
4
tree_compresser/Cargo.lock
generated
4
tree_compresser/Cargo.lock
generated
@ -1,6 +1,6 @@
|
|||||||
# This file is automatically @generated by Cargo.
|
# This file is automatically @generated by Cargo.
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 3
|
version = 4
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
@ -359,7 +359,6 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "rsfdb"
|
name = "rsfdb"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/ecmwf/rsfdb?branch=develop#ab8c9590bba15d22167c274db9238cd9b897baf1"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"libloading",
|
"libloading",
|
||||||
@ -372,7 +371,6 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "rsfindlibs"
|
name = "rsfindlibs"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
source = "git+https://github.com/ecmwf-projects/rsfindlibs.git#1358b1049bf3e0b581badfc8005a9828a542cdaa"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cc",
|
"cc",
|
||||||
"clap",
|
"clap",
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "qubed_tree"
|
name = "qubed"
|
||||||
version = "0.1.0"
|
version = "0.1.2"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
repository = "https://github.com/ecmwf/qubed"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
rsfdb = {git = "https://github.com/ecmwf/rsfdb", branch = "develop"}
|
rsfdb = {git = "https://github.com/ecmwf/rsfdb", branch = "develop"}
|
||||||
@ -16,7 +17,7 @@ crate-type = ["cdylib"]
|
|||||||
path = "./rust_src/lib.rs"
|
path = "./rust_src/lib.rs"
|
||||||
|
|
||||||
[patch.'https://github.com/ecmwf/rsfdb']
|
[patch.'https://github.com/ecmwf/rsfdb']
|
||||||
rsfdb = { path = "../rsfdb" }
|
rsfdb = { path = "../../rsfdb" }
|
||||||
|
|
||||||
[patch.'https://github.com/ecmwf-projects/rsfindlibs']
|
[patch.'https://github.com/ecmwf-projects/rsfindlibs']
|
||||||
rsfindlibs = { path = "../rsfindlibs" }
|
rsfindlibs = { path = "../../rsfindlibs" }
|
@ -0,0 +1,216 @@
|
|||||||
|
import dataclasses
|
||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from frozendict import frozendict
|
||||||
|
|
||||||
|
from .DataCubeTree import Enum, NodeData, Tree
|
||||||
|
from .tree_formatters import HTML, node_tree_to_html, node_tree_to_string
|
||||||
|
|
||||||
|
NodeId = int
|
||||||
|
CacheType = dict[NodeId, "CompressedNode"]
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class CompressedNode:
    """
    A node of a CompressedTree.

    Children are stored as ids and resolved through the shared cache on
    demand. The id and the cache reference are excluded from equality and
    hashing, so two nodes compare equal when their data and child ids match.
    """
    id: NodeId = field(hash=False, compare=False)
    data: NodeData

    _children: tuple[NodeId, ...]
    _cache: CacheType = field(repr=False, hash=False, compare=False)

    @property
    def children(self) -> tuple["CompressedNode", ...]:
        """Resolve the stored child ids into nodes via the cache."""
        lookup = self._cache
        return tuple(lookup[child_id] for child_id in self._children)

    def summary(self, debug = False) -> str:
        """
        Return "key=values" for this node, or plain "root" for the root node.
        With debug=True the node id is appended and the root is not special-cased.
        """
        if not debug and self.data.key == "root":
            return "root"

        label = f"{self.data.key}={self.data.values.summary()}"
        if debug:
            return f"{label} ({self.id})"
        return label
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class CompressedTree:
    """
    This tree is compressed in two distinct ways:

    1. Product Compression: Nodes have a key and **multiple values**, so each node represents many logical nodes key=value1, key=value2, ...
       Each of these logical nodes has identical children so we can compress them like this.
       In this way any distinct path through the tree represents a cartesian product of the values, otherwise known as a datacube.

    2. In order to facilitate the product compression described above we need to know when two nodes have identical children.
       To do this every node is assigned an id which is initially computed as a hash from the node's data and its children's ids.
       In order to avoid hash collisions we increment the initial hash if it's already in the cache for a different node,
       and we do this until we find a unique id.

       Crucially this allows us to later determine if a new node is already cached:
            id = hash(node)
            while True:
                if id not in cache: The node is definitely not in the cache
                elif cache[id] != node: Hash collision, increment id and try again
                else: The node is already in the cache
                id += 1

    This tree can be walked from the root by repeatedly looking up the children of a node in the cache.

    This structure facilitates compression because we can look at the children of a node:
    If two children have the same key, metadata and children then we can compress them into a single node.
    """
    root: CompressedNode
    cache: CacheType

    @staticmethod
    def add_to_cache(cache : dict[NodeId, CompressedNode], data : NodeData, _children: tuple[NodeId, ...]) -> NodeId:
        """
        Add a node to the cache (if it is not already there) and return its id.

        The id starts as a hash of the node's data and its sorted child ids.
        If that id is already taken by a *different* node it is incremented
        (mod 2**64) until a free slot, or the node itself, is found.
        This way the same node is never stored twice under different ids.

        Raises RuntimeError after more than 100 probes.
        """
        _children = tuple(sorted(_children))
        node_id = hash((data, _children))

        # To avoid hash collisions, we increment the id until we find a unique one
        tries = 0
        while True:
            tries += 1
            if node_id not in cache:
                # The node isn't in the cache and this id is free
                cache[node_id] = CompressedNode(
                    id = node_id,
                    data = data,
                    _children = _children,
                    _cache = cache,
                )
                break

            if cache[node_id].data == data and cache[node_id]._children == _children:
                break  # The node is already in the cache

            # This id is already in use by a different node so increment it (mod) and try again
            node_id = (node_id + 1) % (2**64)

            if tries > 100:
                raise RuntimeError("Too many hash collisions, something is wrong.")

        return node_id

    @classmethod
    def from_tree(cls, tree : Tree) -> 'CompressedTree':
        """Build a CompressedTree by caching every node of `tree`, children first."""
        cache: CacheType = {}

        def cache_tree(level : Tree) -> NodeId:
            # Only key and values are carried over; metadata is not copied here.
            node_data = NodeData(
                key = level.key,
                values = level.values,
            )

            # Recursively cache the children
            children = tuple(cache_tree(c) for c in level.children)

            # Add the node to the cache and return its id
            return cls.add_to_cache(cache, node_data, children)

        root = cache_tree(tree)
        return cls(cache = cache, root = cache[root])

    def __str__(self):
        return "".join(node_tree_to_string(self.root))

    def html(self, depth = 2, debug = False) -> HTML:
        """Return an HTML wrapper around the rendered tree, expanded to `depth` levels."""
        return HTML(node_tree_to_html(self.root, depth = depth, debug = debug))

    def _repr_html_(self) -> str:
        return node_tree_to_html(self.root, depth = 2)

    def __getitem__(self, args) -> 'CompressedTree':
        """
        tree[key, value]: return the subtree rooted at the first child of the
        root whose key matches and whose values contain `value`, narrowed to
        that single value. Raises KeyError if no child matches.
        """
        key, value = args
        for c in self.root.children:
            if c.data.key == key and value in c.data.values:
                data = dataclasses.replace(c.data, values = Enum((value,)))
                return CompressedTree(
                    cache = self.cache,
                    root = dataclasses.replace(c, data = data)
                )
        raise KeyError(f"Key {key} not found in children.")

    def collapse_children(self, node: "CompressedNode") -> "CompressedNode":
        """
        Recursively merge children of `node` that share key, metadata and
        child ids into single multi-valued nodes. New nodes are added to
        self.cache; the (possibly new) cached node for `node` is returned.
        """
        # First perform the collapse on the children
        collapsed = [self.collapse_children(child) for child in node.children]

        # Group the collapsed children by (key, metadata, children).
        # The values may differ and will be merged into a single node.
        # A dict is used as an insertion-ordered set so that the order of the
        # merged values (and hence the resulting node id) is reproducible.
        groups: dict[tuple, dict[CompressedNode, None]] = defaultdict(dict)
        for child in collapsed:
            groups[(child.data.key, child.data.metadata, child._children)][child] = None

        # Now go through and create new compressed nodes for any groups that need collapsing
        new_children = []
        for (key, _metadata, _children), group in groups.items():
            if len(group) > 1:
                # Compress the children into a single node
                assert all(isinstance(child.data.values, Enum) for child in group), "All children must have Enum values"
                node_data = NodeData(
                    key = key,
                    metadata = frozendict(),  # Todo: Implement metadata compression
                    values = Enum(tuple(v for child in group for v in child.data.values.values)),
                )

                # Add the node to the cache
                child_id = type(self).add_to_cache(self.cache, node_data, _children)
            else:
                # If the group is size one just keep it
                child_id = next(iter(group)).id

            new_children.append(child_id)

        node_id = self.add_to_cache(self.cache, node.data, tuple(sorted(new_children)))
        return self.cache[node_id]

    def compress(self) -> 'CompressedTree':
        """Return a new tree sharing this cache whose root has been collapsed."""
        return CompressedTree(cache = self.cache, root = self.collapse_children(self.root))

    def lookup(self, selection : dict[str, str]):
        """
        Walk down from the root following `selection` and return the list of
        nodes visited (starting with the root). The walk stops at the first
        node that has no child matching the remaining selection.

        The caller's `selection` is not modified.

        Raises RuntimeError if more than one child matches at some level
        (overlapping branches) or if 1000 levels are exceeded.
        """
        # Work on a copy so consumed keys are not removed from the caller's dict.
        remaining = dict(selection)
        nodes = [self.root]
        for _ in range(1000):
            current_node = nodes[-1]

            # Scan every child before consuming the key, otherwise a second
            # child with the same key could never be detected as an overlap.
            matched = None
            for c in current_node.children:
                if remaining.get(c.data.key, None) in c.data.values:
                    if matched is not None:
                        raise RuntimeError("This tree is invalid, because it contains overlapping branches.")
                    matched = c

            if matched is None:
                return nodes

            nodes.append(matched)
            remaining.pop(matched.data.key)

        raise RuntimeError("Maximum node searches exceeded, the tree contains a loop or something is buggy.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# def reconstruct(self) -> Tree:
|
||||||
|
# def reconstruct_node(h : int) -> Tree:
|
||||||
|
# node = self.cache[h]
|
||||||
|
# dedup : dict[tuple[int, str], set[NodeId]] = defaultdict(set)
|
||||||
|
# for index in self.cache[h].children:
|
||||||
|
# child_node = self.cache[index]
|
||||||
|
# child_hash = hash(child_node.children)
|
||||||
|
# assert isinstance(child_node.values, Enum)
|
||||||
|
# dedup[(child_hash, child_node.key)].add(index)
|
||||||
|
|
||||||
|
|
||||||
|
# children = tuple(
|
||||||
|
# Tree(key = key, values = Enum(tuple(values)),
|
||||||
|
# children = tuple(reconstruct_node(i) for i in self.cache[next(indices)].children)
|
||||||
|
# )
|
||||||
|
# for (_, key), indices in dedup.items()
|
||||||
|
# )
|
||||||
|
|
||||||
|
# return Tree(
|
||||||
|
# key = node.key,
|
||||||
|
# values = node.values,
|
||||||
|
# children = children,
|
||||||
|
# )
|
||||||
|
# return reconstruct_node(self.root)
|
@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
Tree = dict[str, "Tree"]
|
Tree = dict[str, "Tree"]
|
||||||
@ -13,6 +14,11 @@ class RefcountedDict(dict[str, int]):
|
|||||||
def __hash__(self):
|
def __hash__(self):
|
||||||
return hash(tuple(sorted(self.items())))
|
return hash(tuple(sorted(self.items())))
|
||||||
|
|
||||||
|
@dataclass
class JSONNode:
    # One node of the nested structure built by CompressedTree.to_json.
    # key: the part of a cache entry name before "=".
    key: str
    # values: the values (parts after "=") that lead to the same child hash.
    values: list[str]
    # children: the nodes reconstructed from that shared child hash.
    children: list["JSONNode"]
|
||||||
|
|
||||||
class CompressedTree():
|
class CompressedTree():
|
||||||
"""
|
"""
|
||||||
@ -101,6 +107,23 @@ class CompressedTree():
|
|||||||
return {f"{key}={','.join(values)}" : reconstruct_node(h, depth=depth+1) for (h, key), values in dedup.items()}
|
return {f"{key}={','.join(values)}" : reconstruct_node(h, depth=depth+1) for (h, key), values in dedup.items()}
|
||||||
return reconstruct_node(from_node or self.root_hash, depth=0)
|
return reconstruct_node(from_node or self.root_hash, depth=0)
|
||||||
|
|
||||||
|
def to_json(self, max_depth=None, from_node=None) -> dict:
    """
    Convert the compressed tree into nested plain dicts (via JSONNode).

    max_depth: if given, nodes deeper than this get an empty children list.
    from_node: hash of the node to start from; defaults to self.root_hash.

    NOTE(review): only the first node below the starting node is returned
    (index [0]); an empty starting node raises IndexError. Confirm intended.
    """
    def reconstruct_node(h : int, depth : int) -> list[JSONNode]:
        if max_depth is not None and depth > max_depth:
            # Must be a list to match the declared return type and the
            # JSONNode.children field.
            return []

        # Group values that share both the key and the child hash.
        dedup : dict[tuple[int, str], set[str]] = defaultdict(set)
        for k, h2 in self.cache[h].items():
            # Split on the first "=" only so values may themselves contain "=".
            key, value = k.split("=", 1)
            dedup[(h2, key)].add(value)

        return [
            JSONNode(
                key = key,
                values = sorted(values),  # sorted for reproducible output
                children = reconstruct_node(child_hash, depth=depth+1),
            )
            for (child_hash, key), values in dedup.items()
        ]

    return asdict(reconstruct_node(from_node or self.root_hash, depth=0)[0])
|
||||||
|
|
||||||
def __init__(self, tree : Tree):
|
def __init__(self, tree : Tree):
|
||||||
self.cache = {}
|
self.cache = {}
|
||||||
self.empty_hash = hash(RefcountedDict({}))
|
self.empty_hash = hash(RefcountedDict({}))
|
||||||
@ -139,8 +162,8 @@ class CompressedTree():
|
|||||||
return list(loc.keys())
|
return list(loc.keys())
|
||||||
|
|
||||||
def multi_match(self, request : dict[str, list[str]], loc = None):
|
def multi_match(self, request : dict[str, list[str]], loc = None):
|
||||||
if not loc: return {"_END_" : {}}
|
|
||||||
if loc is None: loc = self.tree
|
if loc is None: loc = self.tree
|
||||||
|
if loc == {}: return {"_END_" : {}}
|
||||||
matches = {}
|
matches = {}
|
||||||
for request_key, request_values in request.items():
|
for request_key, request_values in request.items():
|
||||||
for request_value in request_values:
|
for request_value in request_values:
|
||||||
|
267
tree_compresser/python_src/tree_traverser/DataCubeTree.py
Normal file
267
tree_compresser/python_src/tree_traverser/DataCubeTree.py
Normal file
@ -0,0 +1,267 @@
|
|||||||
|
import dataclasses
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Callable, Hashable, Literal, Mapping
|
||||||
|
|
||||||
|
from frozendict import frozendict
|
||||||
|
|
||||||
|
from .tree_formatters import HTML, node_tree_to_html, node_tree_to_string
|
||||||
|
from .value_types import DateRange, Enum, IntRange, TimeRange, Values
|
||||||
|
|
||||||
|
|
||||||
|
def values_from_json(obj) -> "Values":
    """
    Build a Values object from its JSON form.

    A list becomes an Enum of its items. Anything else is indexed by "dtype"
    and passed as keyword arguments to DateRange, TimeRange or IntRange.
    Raises ValueError for any other dtype.
    """
    if isinstance(obj, list):
        return Enum(tuple(obj))

    dtype = obj["dtype"]
    if dtype == "date":
        return DateRange(**obj)
    if dtype == "time":
        return TimeRange(**obj)
    if dtype == "int":
        return IntRange(**obj)
    raise ValueError(f"Unknown dtype {obj['dtype']}")
|
||||||
|
|
||||||
|
# In practice use a frozendict
|
||||||
|
Metadata = Mapping[str, str | int | float | bool]
|
||||||
|
|
||||||
|
@dataclass(frozen=True, eq=True, order=True)
class NodeData:
    """
    The payload of a tree node: a key, the values it takes, and metadata.

    Metadata is excluded from comparison, so it does not take part in
    equality or ordering.
    """
    key: str
    values: Values
    metadata: dict[str, tuple[Hashable, ...]] = field(default_factory=frozendict, compare=False)

    def summary(self) -> str:
        """Return "key=values", or plain "root" for the root node."""
        if self.key == "root":
            return "root"
        return f"{self.key}={self.values.summary()}"
|
||||||
|
|
||||||
|
@dataclass(frozen=True, eq=True, order=True)
|
||||||
|
class Tree:
|
||||||
|
data: NodeData
|
||||||
|
children: tuple['Tree', ...]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def key(self) -> str:
|
||||||
|
return self.data.key
|
||||||
|
|
||||||
|
@property
|
||||||
|
def values(self) -> Values:
|
||||||
|
return self.data.values
|
||||||
|
|
||||||
|
@property
|
||||||
|
def metadata(self) -> frozendict[str, Any]:
|
||||||
|
return self.data.metadata
|
||||||
|
|
||||||
|
|
||||||
|
def summary(self) -> str:
|
||||||
|
return self.data.summary()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def make(cls, key : str, values : Values, children, **kwargs) -> 'Tree':
|
||||||
|
return cls(
|
||||||
|
data = NodeData(key, values, metadata = kwargs.get("metadata", frozendict())
|
||||||
|
),
|
||||||
|
children = tuple(sorted(children)),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_json(cls, json: dict) -> 'Tree':
|
||||||
|
def from_json(json: dict) -> Tree:
|
||||||
|
return Tree.make(
|
||||||
|
key=json["key"],
|
||||||
|
values=values_from_json(json["values"]),
|
||||||
|
metadata=json["metadata"] if "metadata" in json else {},
|
||||||
|
children=tuple(from_json(c) for c in json["children"])
|
||||||
|
)
|
||||||
|
return from_json(json)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, d: dict) -> 'Tree':
|
||||||
|
def from_dict(d: dict) -> tuple[Tree, ...]:
|
||||||
|
return tuple(Tree.make(
|
||||||
|
key=k.split("=")[0],
|
||||||
|
values=Enum(tuple(k.split("=")[1].split("/"))),
|
||||||
|
children=from_dict(children)
|
||||||
|
) for k, children in d.items())
|
||||||
|
|
||||||
|
return Tree.make(key = "root",
|
||||||
|
values=Enum(("root",)),
|
||||||
|
children = from_dict(d))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def empty(cls) -> 'Tree':
|
||||||
|
return cls.make("root", Enum(("root",)), [])
|
||||||
|
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return "".join(node_tree_to_string(node=self))
|
||||||
|
|
||||||
|
def html(self, depth = 2, collapse = True) -> HTML:
|
||||||
|
return HTML(node_tree_to_html(self, depth = depth, collapse = collapse))
|
||||||
|
|
||||||
|
def _repr_html_(self) -> str:
|
||||||
|
return node_tree_to_html(self, depth = 2, collapse = True)
|
||||||
|
|
||||||
|
def __getitem__(self, args) -> 'Tree':
|
||||||
|
key, value = args
|
||||||
|
for c in self.children:
|
||||||
|
if c.key == key and value in c.values:
|
||||||
|
data = dataclasses.replace(c.data, values = Enum((value,)))
|
||||||
|
return dataclasses.replace(c, data = data)
|
||||||
|
raise KeyError(f"Key {key} not found in children of {self.key}")
|
||||||
|
|
||||||
|
|
||||||
|
def print(self, depth = None):
|
||||||
|
print("".join(cc for c in self.children for cc in node_tree_to_string(node=c, depth = depth)))
|
||||||
|
|
||||||
|
def transform(self, func: 'Callable[[Tree], Tree | list[Tree]]') -> 'Tree':
|
||||||
|
"""
|
||||||
|
Call a function on every node of the tree, return one or more nodes.
|
||||||
|
If multiple nodes are returned they each get a copy of the (transformed) children of the original node.
|
||||||
|
Any changes to the children of a node will be ignored.
|
||||||
|
"""
|
||||||
|
def transform(node: Tree) -> list[Tree]:
|
||||||
|
children = [cc for c in node.children for cc in transform(c)]
|
||||||
|
new_nodes = func(node)
|
||||||
|
if isinstance(new_nodes, Tree):
|
||||||
|
new_nodes = [new_nodes]
|
||||||
|
|
||||||
|
return [dataclasses.replace(new_node, children = children)
|
||||||
|
for new_node in new_nodes]
|
||||||
|
|
||||||
|
children = tuple(cc for c in self.children for cc in transform(c))
|
||||||
|
return dataclasses.replace(self, children = children)
|
||||||
|
|
||||||
|
def guess_datatypes(self) -> 'Tree':
|
||||||
|
def guess_datatypes(node: Tree) -> list[Tree]:
|
||||||
|
# Try to convert enum values into more structured types
|
||||||
|
children = tuple(cc for c in node.children for cc in guess_datatypes(c))
|
||||||
|
|
||||||
|
if isinstance(node.values, Enum):
|
||||||
|
match node.key:
|
||||||
|
case "time": range_class = TimeRange
|
||||||
|
case "date": range_class = DateRange
|
||||||
|
case _: range_class = None
|
||||||
|
|
||||||
|
if range_class is not None:
|
||||||
|
return [
|
||||||
|
dataclasses.replace(node, values = range, children = children)
|
||||||
|
for range in range_class.from_strings(node.values.values)
|
||||||
|
]
|
||||||
|
return [dataclasses.replace(node, children = children)]
|
||||||
|
|
||||||
|
children = tuple(cc for c in self.children for cc in guess_datatypes(c))
|
||||||
|
return dataclasses.replace(self, children = children)
|
||||||
|
|
||||||
|
|
||||||
|
def select(self, selection : dict[str, str | list[str]], mode: Literal["strict", "relaxed"] = "relaxed") -> 'Tree':
|
||||||
|
# make all values lists
|
||||||
|
selection = {k : v if isinstance(v, list) else [v] for k,v in selection.items()}
|
||||||
|
|
||||||
|
def not_none(xs): return tuple(x for x in xs if x is not None)
|
||||||
|
|
||||||
|
def select(node: Tree) -> Tree | None:
|
||||||
|
# Check if the key is specified in the selection
|
||||||
|
if node.key not in selection:
|
||||||
|
if mode == "strict":
|
||||||
|
return None
|
||||||
|
return dataclasses.replace(node, children = not_none(select(c) for c in node.children))
|
||||||
|
|
||||||
|
# If the key is specified, check if any of the values match
|
||||||
|
values = Enum(tuple(c for c in selection[node.key] if c in node.values))
|
||||||
|
|
||||||
|
if not values:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return dataclasses.replace(node, values = values, children = not_none(select(c) for c in node.children))
|
||||||
|
|
||||||
|
return dataclasses.replace(self, children = not_none(select(c) for c in self.children))
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _insert(position: "Tree", identifier : list[tuple[str, list[str]]]):
|
||||||
|
"""
|
||||||
|
This algorithm goes as follows:
|
||||||
|
We're at a particular node in the tree, and we have a list of key-values pairs that we want to insert.
|
||||||
|
We take the first key values pair
|
||||||
|
key, values = identifier.pop(0)
|
||||||
|
|
||||||
|
The general idea is to insert key, values into the current node and use recursion to handle the rest of the identifier.
|
||||||
|
|
||||||
|
We have two sources of values with possible overlap. The values to insert and the values attached to the children of this node.
|
||||||
|
For each value coming from either source we put it in one of three categories:
|
||||||
|
1) Values that exist only in the already existing child. (Coming exclusively from position.children)
|
||||||
|
2) Values that exist in both a child and the new values.
|
||||||
|
3) Values that exist only in the new values.
|
||||||
|
|
||||||
|
|
||||||
|
Thus we add the values to insert to a set, and loop over the children.
|
||||||
|
For each child we partition its values into the three categories.
|
||||||
|
|
||||||
|
For 1) we create a new child node with the key, reduced set of values and the same children.
|
||||||
|
For 2)
|
||||||
|
Create a new child node with the key, and the values in group 2
|
||||||
|
Recurse to compute the children
|
||||||
|
|
||||||
|
Once we have finished looping over children we know all the values left over came exclusively from the new values.
|
||||||
|
So we:
|
||||||
|
Create a new node with these values.
|
||||||
|
Recurse to compute the children
|
||||||
|
|
||||||
|
Finally we return the node with all these new children.
|
||||||
|
"""
|
||||||
|
if not identifier:
|
||||||
|
return position
|
||||||
|
|
||||||
|
key, values = identifier.pop(0)
|
||||||
|
# print(f"Inserting {key}={values} into {position.summary()}")
|
||||||
|
|
||||||
|
# Determine which children have this key
|
||||||
|
possible_children = {c : [] for c in position.children if c.key == key}
|
||||||
|
entirely_new_values = []
|
||||||
|
|
||||||
|
# For each value check it is already in one of the children
|
||||||
|
for v in values:
|
||||||
|
for c in possible_children:
|
||||||
|
if v in c.values:
|
||||||
|
possible_children[c].append(v)
|
||||||
|
break
|
||||||
|
else: # only executed if the loop did not break
|
||||||
|
# If none of the children have this value, add it to the new child pile
|
||||||
|
entirely_new_values.append(v)
|
||||||
|
|
||||||
|
# d = {p.summary() : v for p, v in possible_children.items()}
|
||||||
|
# print(f" {d} new_values={entirely_new_values}")
|
||||||
|
|
||||||
|
new_children = []
|
||||||
|
for c, affected in possible_children.items():
|
||||||
|
if not affected:
|
||||||
|
new_children.append(c)
|
||||||
|
continue
|
||||||
|
|
||||||
|
unaffected = [x for x in c.values if x not in affected]
|
||||||
|
if unaffected:
|
||||||
|
unaffected_node = Tree.make(c.key, Enum(tuple(unaffected)), c.children)
|
||||||
|
new_children.append(unaffected_node) # Add the unaffected part of this child
|
||||||
|
|
||||||
|
if affected: # This check is not technically necessary, but it makes the code more readable
|
||||||
|
new_node = Tree.make(key, Enum(tuple(affected)), [])
|
||||||
|
new_node = Tree._insert(new_node, identifier)
|
||||||
|
new_children.append(new_node) # Add the affected part of this child
|
||||||
|
|
||||||
|
# If there are any values not in any of the existing children, add them as a new child
|
||||||
|
if entirely_new_values:
|
||||||
|
new_node = Tree.make(key, Enum(tuple(entirely_new_values)), [])
|
||||||
|
new_children.append(Tree._insert(new_node, identifier))
|
||||||
|
|
||||||
|
return Tree.make(position.key, position.values, new_children)
|
||||||
|
|
||||||
|
def insert(self, identifier : dict[str, list[str]]) -> 'Tree':
|
||||||
|
insertion = [(k, v) for k, v in identifier.items()]
|
||||||
|
return Tree._insert(self, insertion)
|
||||||
|
|
||||||
|
def to_list_of_cubes(self):
|
||||||
|
def to_list_of_cubes(node: Tree) -> list[list[Tree]]:
|
||||||
|
return [[node] + sub_cube for c in node.children for sub_cube in to_list_of_cubes(c)]
|
||||||
|
|
||||||
|
return to_list_of_cubes(self)
|
||||||
|
|
||||||
|
def info(self):
|
||||||
|
cubes = self.to_list_of_cubes()
|
||||||
|
print(f"Number of distinct paths: {len(cubes)}")
|
116
tree_compresser/python_src/tree_traverser/tree_formatters.py
Normal file
116
tree_compresser/python_src/tree_traverser/tree_formatters.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Iterable, Protocol, Sequence, runtime_checkable
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class TreeLike(Protocol):
    """Structural type for anything the formatters can render."""

    @property
    def children(self) -> Sequence["TreeLike"]:
        """Child nodes; must support len() and indexing like node.children[i]."""
        ...

    def summary(self, **kwargs) -> str:
        """One-line text describing this node."""
        ...
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class HTML():
    """Immutable wrapper around an HTML string, exposing it via _repr_html_."""
    html: str

    def _repr_html_(self):
        """Return the wrapped markup unchanged."""
        markup = self.html
        return markup
|
||||||
|
|
||||||
|
def summarize_node(node: "TreeLike", collapse = False, **kwargs) -> "tuple[str, TreeLike]":
    """
    Summarize a node, optionally folding a chain of single-child nodes into it.

    Each summary longer than 50 characters is cut to 50 and suffixed "...".
    With collapse=True the walk continues downward while the current node has
    exactly one child. Returns the summaries joined by ", " together with the
    node at which the walk stopped. Extra kwargs are passed to node.summary().
    """
    def clipped(current) -> str:
        text = current.summary(**kwargs)
        return text if len(text) <= 50 else text[:50] + "..."

    parts = [clipped(node)]

    # Move down only while there is exactly one child to follow
    while collapse and len(node.children) == 1:
        node = node.children[0]
        parts.append(clipped(node))

    return ", ".join(parts), node
|
||||||
|
|
||||||
|
def node_tree_to_string(node : "TreeLike", prefix : str = "", depth = None) -> "Iterable[str]":
    """
    Yield the pieces of a text rendering of the tree rooted at `node`.

    prefix: text placed before each connector on the lines below this node.
    depth: number of levels to expand; None means unlimited. When it reaches
    zero the node is printed followed by " - ...".
    """
    summary, node = summarize_node(node)

    if depth is not None and depth <= 0:
        yield summary + " - ...\n"
        return

    children = node.children

    # Special case for nodes with only a single child, this makes the printed representation more compact
    if len(children) == 1:
        yield summary + ", "
        yield from node_tree_to_string(children[0], prefix, depth = depth)
        return

    yield summary + "\n"

    next_depth = depth - 1 if depth is not None else None
    last_index = len(children) - 1
    for index, child in enumerate(children):
        is_last = index == last_index
        yield prefix + ("└── " if is_last else "├── ")
        # The continuation prefix must be as wide as the connector (4 columns)
        # so that deeper levels line up underneath it.
        extension = "    " if is_last else "│   "
        yield from node_tree_to_string(child, prefix + extension, depth = next_depth)
|
||||||
|
|
||||||
|
def _node_tree_to_html(node : "TreeLike", prefix : str = "", depth = 1, connector = "", **kwargs) -> "Iterable[str]":
    """
    Yield the HTML fragments for the tree rooted at `node`.

    Leaves become <span class="leaf">, other nodes become <details> elements
    that are rendered open while `depth` is positive. `connector` is the
    box-drawing text shown before this node's summary and `prefix` is the
    text that precedes the connectors of its children. Extra kwargs are
    passed to summarize_node.
    """
    from html import escape

    summary, node = summarize_node(node, **kwargs)

    # Summaries come from tree data, so escape them before embedding in markup.
    label = escape(f"{connector}{summary}", quote=False)

    children = node.children
    if len(children) == 0:
        yield f'<span class="leaf">{label}</span>'
        return

    open_attr = "open" if depth > 0 else ""
    yield f"<details {open_attr}><summary>{label}</summary>"

    last_index = len(children) - 1
    for index, child in enumerate(children):
        is_last = index == last_index
        child_connector = "└── " if is_last else "├── "
        # Same width as the connector (4 columns) so nested levels line up.
        extension = "    " if is_last else "│   "
        yield from _node_tree_to_html(
            child,
            prefix + extension,
            depth = depth - 1,
            connector = prefix + child_connector,
            **kwargs,
        )
    yield "</details>"
|
||||||
|
|
||||||
|
def node_tree_to_html(node : "TreeLike", depth = 1, **kwargs) -> str:
    """
    Render the tree rooted at `node` as a self-contained HTML string:
    a <style> block followed by a <pre class='qubed-tree-view'> element.

    depth: number of levels rendered expanded. Extra kwargs are passed on to
    _node_tree_to_html.
    """
    # All rules are scoped to .qubed-tree-view so they do not affect the
    # surrounding page.
    css = """
    <style>
    .qubed-tree-view {
        font-family: monospace;
        white-space: pre;
    }
    .qubed-tree-view details {
        /* display: inline; */
        margin-left: 0;
    }
    .qubed-tree-view summary {
        list-style: none;
        cursor: pointer;
        text-overflow: ellipsis;
        overflow: hidden;
        text-wrap: nowrap;
        display: block;
    }

    .qubed-tree-view .leaf {
        text-overflow: ellipsis;
        overflow: hidden;
        text-wrap: nowrap;
        display: block;
    }

    .qubed-tree-view summary:hover,.qubed-tree-view span.leaf:hover {
        background-color: #f0f0f0;
    }
    .qubed-tree-view details > summary::after {
        content: ' ▲';
    }
    .qubed-tree-view details:not([open]) > summary::after {
        content: " ▼";
    }
    </style>

    """
    nodes = "".join(_node_tree_to_html(node=node, depth=depth, **kwargs))
    return f"{css}<pre class='qubed-tree-view'>{nodes}</pre>"
|
40
tree_compresser/python_src/tree_traverser/trie.py
Normal file
40
tree_compresser/python_src/tree_traverser/trie.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
character = str


# eq=False keeps the default identity-based __eq__ and __hash__. A generated
# hash would always raise because `children` is a dict, and a generated
# field-by-field __eq__ would recurse through the parent/children cycle.
@dataclass(eq=False)
class TrieNode():
    # parent: the node one character closer to the root; None for the root.
    parent: "TrieNode | None" = field(repr=False)
    # parent_char: the character on the edge from parent to this node.
    parent_char: character
    # children: next character -> child node.
    children: dict[character, "TrieNode"] = field(default_factory=dict)


@dataclass
class Trie:
    """
    A trie of strings in which each inserted word is identified by the
    id() of its final node, so the word can be recovered from that id.
    """
    root: TrieNode = field(default_factory=lambda: TrieNode(None, ""))
    # id(node) -> node, for the final node of every inserted word.
    reverse_lookup: dict[int, TrieNode] = field(default_factory=dict)

    def insert(self, word: str) -> int:
        """Insert `word` (creating nodes as needed) and return the id of its final node."""
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode(node, char)
            node = node.children[char]

        n_id = id(node)
        if n_id not in self.reverse_lookup:
            self.reverse_lookup[n_id] = node

        return n_id

    def lookup_by_id(self, n_id: int) -> str:
        """
        Return the word whose final node has id `n_id`.
        Raises KeyError if no inserted word ended at such a node.
        """
        leaf_node = self.reverse_lookup[n_id]
        string = []
        # Walk up to the root collecting edge characters (in reverse order).
        while leaf_node.parent is not None:
            string.append(leaf_node.parent_char)
            leaf_node = leaf_node.parent

        return "".join(reversed(string))
|
||||||
|
|
214
tree_compresser/python_src/tree_traverser/value_types.py
Normal file
214
tree_compresser/python_src/tree_traverser/value_types.py
Normal file
@ -0,0 +1,214 @@
|
|||||||
|
import dataclasses
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import date, datetime, timedelta
|
||||||
|
from typing import Any, Iterable, Literal
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Values(ABC):
    """Abstract base for the set of values a key can take.

    Concrete subclasses must provide a printable summary, a size, a
    membership test and a ``from_strings`` constructor.
    """

    @abstractmethod
    def summary(self) -> str:
        """Return a compact string representation of the values."""
        pass

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of values represented."""
        pass

    @abstractmethod
    def __contains__(self, value: Any) -> bool:
        """Return whether ``value`` is one of the represented values."""
        pass

    # Declared as a classmethod because it is an alternate constructor: the
    # range implementations build instances from raw strings without needing
    # an existing instance to call it on.
    @classmethod
    @abstractmethod
    def from_strings(cls, values: Iterable[str]) -> list['Values']:
        """Build one or more instances covering the given string values."""
        pass
|
||||||
|
|
||||||
|
@dataclass(frozen=True, order=True)
class Enum(Values):
    """
    The simplest kind of key value is just a list of strings.
    summary -> string1/string2/string....
    """
    # The explicit values, stored as a tuple (checked in __post_init__).
    values: tuple[Any, ...]

    def __post_init__(self):
        # A tuple keeps the frozen dataclass hashable; lists are rejected.
        assert isinstance(self.values, tuple)

    def __iter__(self):
        """Iterate over the stored values in their stored order."""
        return iter(self.values)

    def __len__(self) -> int:
        """Return the number of stored values."""
        return len(self.values)

    def summary(self) -> str:
        """Return the values sorted and joined with ``/``."""
        return '/'.join(map(str, sorted(self.values)))

    def __contains__(self, value: Any) -> bool:
        """Return whether ``value`` is one of the stored values."""
        return value in self.values

    # A classmethod, like the range classes' from_strings, so it can be called
    # on the class itself; calling it on an instance keeps working.
    @classmethod
    def from_strings(cls, values: Iterable[str]) -> list['Values']:
        """Wrap ``values`` in a single instance and return it in a list."""
        return [cls(tuple(values))]
|
||||||
|
|
||||||
|
@dataclasses.dataclass(frozen=True)
class Range(Values, ABC):
    """Abstract base for value sets described as a range.

    ``dtype`` is a keyword-only tag with no default here; each concrete range
    class in this module overrides it with its own default.
    """
    dtype: str = dataclasses.field(kw_only=True)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class DateRange(Range):
    """An inclusive range of calendar dates with a fixed step.

    A ``step`` of ``timedelta(0)`` marks a range holding a single date
    (``start == end``). Dates are read and written as ``YYYYMMDD`` strings.
    """
    start: date
    end: date
    step: timedelta
    dtype: Literal["date"] = dataclasses.field(kw_only=True, default="date")

    @classmethod
    def from_strings(cls, values: Iterable[str]) -> list['DateRange']:
        """Group ``YYYYMMDD`` strings into runs of consecutive days.

        Args:
            values: Date strings in ``%Y%m%d`` format, in any order.
                Duplicates are ignored.

        Returns:
            One range per run of consecutive days, in ascending order. A run
            of one day gets ``step=timedelta(0)``, longer runs get a one-day
            step. An empty input gives an empty list.

        Raises:
            ValueError: If a string does not match ``%Y%m%d``.
        """
        one_day = timedelta(days=1)
        # Parse down to plain dates: a datetime in start/end cannot be
        # compared with the date that __contains__ builds from its argument.
        dates = sorted({datetime.strptime(v, "%Y%m%d").date() for v in values})

        ranges: list['DateRange'] = []
        if not dates:
            return ranges

        run_start = previous = dates[0]
        for current in dates[1:]:
            if current - previous != one_day:
                # Gap found: close the run that ended on ``previous``.
                ranges.append(cls._from_bounds(run_start, previous))
                run_start = current
            previous = current
        # The last run is still open when the loop ends.
        ranges.append(cls._from_bounds(run_start, previous))
        return ranges

    @classmethod
    def _from_bounds(cls, start: date, end: date) -> 'DateRange':
        """Build the range for one run; a single day gets a zero step."""
        step = timedelta(days=0) if start == end else timedelta(days=1)
        return cls(start=start, end=end, step=step)

    def __contains__(self, value: Any) -> bool:
        """Return whether the ``YYYYMMDD`` string ``value`` lies on the range.

        Raises:
            ValueError: If ``value`` does not match ``%Y%m%d``.
        """
        v = datetime.strptime(value, "%Y%m%d").date()
        if not self.start <= v <= self.end:
            return False
        if self.step == timedelta(0):
            # Single-date range: a zero step cannot be used as a modulus.
            return v == self.start
        # timedelta % timedelta is a timedelta, which never equals the int 0.
        return (v - self.start) % self.step == timedelta(0)

    def __len__(self) -> int:
        """Return the number of dates in the range, both ends included."""
        if self.step == timedelta(0):
            return 1
        return (self.end - self.start) // self.step + 1

    def summary(self) -> str:
        """Return ``start``, ``start/to/end`` or ``start/to/end/by/<days>``."""
        def fmt(d): return d.strftime("%Y%m%d")
        if self.step == timedelta(days=0):
            return f"{fmt(self.start)}"
        if self.step == timedelta(days=1):
            return f"{fmt(self.start)}/to/{fmt(self.end)}"

        return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step // timedelta(days=1)}"
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TimeRange(Range):
    """An inclusive range of integer time values with a fixed step.

    A ``step`` of ``0`` marks a range holding a single value
    (``start == end``). ``summary`` prints values zero-padded to four digits.
    """
    start: int
    end: int
    step: int
    dtype: Literal["time"] = dataclasses.field(kw_only=True, default="time")

    @classmethod
    def from_strings(cls, values: Iterable[str]) -> list['TimeRange']:
        """Group integer strings into runs of consecutive values.

        Args:
            values: Strings parseable by ``int``, in any order. Any iterable
                is accepted; duplicates are ignored.

        Returns:
            One range per run, in ascending order. A run of one value gets
            ``step=0`` (the same marker ``summary`` checks for), longer runs
            get ``step=1``. An empty input gives an empty list.

        Raises:
            ValueError: If a string cannot be parsed as an integer.
        """
        times = sorted({int(v) for v in values})

        ranges: list['TimeRange'] = []
        if not times:
            return ranges

        run_start = previous = times[0]
        for current in times[1:]:
            # NOTE(review): values are treated as adjacent when they differ
            # by 1, as in the original loop. If these are HHMM times, hourly
            # values differ by 100 and will never merge; confirm the intended
            # step with the callers.
            if current - previous != 1:
                ranges.append(cls._from_bounds(run_start, previous))
                run_start = current
            previous = current
        # The last run is still open when the loop ends.
        ranges.append(cls._from_bounds(run_start, previous))
        return ranges

    @classmethod
    def _from_bounds(cls, start: int, end: int) -> 'TimeRange':
        """Build the range for one run; a single value gets a zero step."""
        return cls(start=start, end=end, step=0 if start == end else 1)

    def __len__(self) -> int:
        """Return the number of values in the range, both ends included."""
        if self.step == 0:
            return 1
        return (self.end - self.start) // self.step + 1

    def summary(self) -> str:
        """Return ``start`` for a single value, else ``start/to/end/by/step``."""
        def fmt(d): return f"{d:04d}"
        if self.step == 0:
            return f"{fmt(self.start)}"
        return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"

    def __contains__(self, value: Any) -> bool:
        """Return whether ``int(value)`` lies on the range.

        Raises:
            ValueError: If ``value`` cannot be parsed as an integer.
        """
        v = int(value)
        if not self.start <= v <= self.end:
            return False
        if self.step == 0:
            # Single-value range: a zero step cannot be used as a modulus.
            return v == self.start
        return (v - self.start) % self.step == 0
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IntRange(Range):
    """An inclusive range of integers with a fixed step.

    A ``step`` of ``0`` marks a range holding a single value
    (``start == end``).
    """
    start: int
    end: int
    step: int
    dtype: Literal["int"] = dataclasses.field(kw_only=True, default="int")

    def __len__(self) -> int:
        """Return the number of integers in the range, both ends included."""
        if self.step == 0:
            return 1
        return (self.end - self.start) // self.step + 1

    def summary(self) -> str:
        """Return ``start`` for a single value, else ``start/to/end/by/step``."""
        # Plain decimal formatting: the bounds are ints, which have no
        # strftime method.
        if self.step == 0:
            return f"{self.start}"
        return f"{self.start}/to/{self.end}/by/{self.step}"

    def __contains__(self, value: Any) -> bool:
        """Return whether ``int(value)`` lies on the range.

        Raises:
            ValueError: If ``value`` cannot be parsed as an integer.
        """
        v = int(value)
        if not self.start <= v <= self.end:
            return False
        if self.step == 0:
            # Single-value range: a zero step cannot be used as a modulus.
            return v == self.start
        return (v - self.start) % self.step == 0

    @classmethod
    def from_strings(cls, values: Iterable[str]) -> list['IntRange']:
        """Group integer strings into runs of consecutive integers.

        Args:
            values: Strings parseable by ``int``, in any order. Any iterable
                is accepted; duplicates are ignored.

        Returns:
            One range per run, in ascending order. A run of one value gets
            ``step=0``, longer runs get ``step=1``. An empty input gives an
            empty list.

        Raises:
            ValueError: If a string cannot be parsed as an integer.
        """
        ints = sorted({int(v) for v in values})

        ranges: list['IntRange'] = []
        if not ints:
            return ranges

        run_start = previous = ints[0]
        for current in ints[1:]:
            if current - previous != 1:
                # Gap found: close the run that ended on ``previous``.
                ranges.append(cls._from_bounds(run_start, previous))
                run_start = current
            previous = current
        # The last run is still open when the loop ends.
        ranges.append(cls._from_bounds(run_start, previous))
        return ranges

    @classmethod
    def _from_bounds(cls, start: int, end: int) -> 'IntRange':
        """Build the range for one run; a single value gets a zero step."""
        return cls(start=start, end=end, step=0 if start == end else 1)
|
@ -3,15 +3,15 @@ from pathlib import Path
|
|||||||
|
|
||||||
from tree_traverser import CompressedTree
|
from tree_traverser import CompressedTree
|
||||||
|
|
||||||
data_path = Path("/home/eouser/qubed/config/climate-dt/compressed_tree.json")
|
data_path = Path("./config/climate-dt/compressed_tree.json")
|
||||||
# Print size of file
|
# Print size of file
|
||||||
print(f"climate dt compressed tree: {data_path.stat().st_size // 1e6:.1f} MB")
|
print(f"climate dt compressed tree: {data_path.stat().st_size // 1e6:.1f} MB")
|
||||||
|
|
||||||
print("Opening json file")
|
print("Opening json file")
|
||||||
compressed_tree = CompressedTree.load(data_path)
|
compressed_tree = CompressedTree.load(data_path)
|
||||||
|
|
||||||
print(compressed_tree.reconstruct_compressed_ecmwf_style())
|
print(compressed_tree.to_json())
|
||||||
|
|
||||||
# print("Outputting compressed tree ecmwf style")
|
print("Outputting compressed tree ecmwf style")
|
||||||
# with open("data/compressed_tree_climate_dt_ecmwf_style.json", "w") as f:
|
with open("config/climate-dt/new_format.json", "w") as f:
|
||||||
# json.dump(compressed_tree.reconstruct_compressed_ecmwf_style(), f)
|
json.dump(compressed_tree.to_json(), f)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user