updates
This commit is contained in:
parent
81a478a58f
commit
609e3e9f74
4
tree_compresser/Cargo.lock
generated
4
tree_compresser/Cargo.lock
generated
@ -1,6 +1,6 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
@ -359,7 +359,6 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||
[[package]]
|
||||
name = "rsfdb"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/ecmwf/rsfdb?branch=develop#ab8c9590bba15d22167c274db9238cd9b897baf1"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"libloading",
|
||||
@ -372,7 +371,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "rsfindlibs"
|
||||
version = "0.1.1"
|
||||
source = "git+https://github.com/ecmwf-projects/rsfindlibs.git#1358b1049bf3e0b581badfc8005a9828a542cdaa"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"clap",
|
||||
|
@ -1,7 +1,8 @@
|
||||
[package]
|
||||
name = "qubed_tree"
|
||||
version = "0.1.0"
|
||||
name = "qubed"
|
||||
version = "0.1.2"
|
||||
edition = "2021"
|
||||
repository = "https://github.com/ecmwf/qubed"
|
||||
|
||||
[dependencies]
|
||||
rsfdb = {git = "https://github.com/ecmwf/rsfdb", branch = "develop"}
|
||||
@ -16,7 +17,7 @@ crate-type = ["cdylib"]
|
||||
path = "./rust_src/lib.rs"
|
||||
|
||||
[patch.'https://github.com/ecmwf/rsfdb']
|
||||
rsfdb = { path = "../rsfdb" }
|
||||
rsfdb = { path = "../../rsfdb" }
|
||||
|
||||
[patch.'https://github.com/ecmwf-projects/rsfindlibs']
|
||||
rsfindlibs = { path = "../rsfindlibs" }
|
||||
rsfindlibs = { path = "../../rsfindlibs" }
|
@ -0,0 +1,216 @@
|
||||
import dataclasses
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from frozendict import frozendict
|
||||
|
||||
from .DataCubeTree import Enum, NodeData, Tree
|
||||
from .tree_formatters import HTML, node_tree_to_html, node_tree_to_string
|
||||
|
||||
NodeId = int
|
||||
CacheType = dict[NodeId, "CompressedNode"]
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CompressedNode:
|
||||
id: NodeId = field(hash=False, compare=False)
|
||||
data: NodeData
|
||||
|
||||
_children: tuple[NodeId, ...]
|
||||
_cache: CacheType = field(repr=False, hash=False, compare=False)
|
||||
|
||||
@property
|
||||
def children(self) -> tuple["CompressedNode", ...]:
|
||||
return tuple(self._cache[i] for i in self._children)
|
||||
|
||||
def summary(self, debug = False) -> str:
|
||||
if debug: return f"{self.data.key}={self.data.values.summary()} ({self.id})"
|
||||
return f"{self.data.key}={self.data.values.summary()}" if self.data.key != "root" else "root"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CompressedTree:
|
||||
"""
|
||||
This tree is compressed in two distinct different ways:
|
||||
1. Product Compression: Nodes have a key and **multiple values**, so each node represents many logical nodes key=value1, key=value2, ...
|
||||
Each of these logical nodes is has identical children so we can compress them like this.
|
||||
In this way any distinct path through the tree represents a cartesian product of the values, otherwise known as a datacube.
|
||||
|
||||
2. In order to facilitate the product compression described above we need to know when two nodes have identical children.
|
||||
To do this every node is assigned an Id which is initially computed as a hash from the nodes data and its childrens' ids.
|
||||
In order to avoid hash collisions we increment the initial hash if it's already in the cache for a different node
|
||||
we do this until we find a unique id.
|
||||
|
||||
Crucially this allows us to later determine if a new node is already cached:
|
||||
id = hash(node)
|
||||
while True:
|
||||
if id not in cache: The node is definitely not in the cache
|
||||
elif cache[id] != node: Hash collision, increment id and try again
|
||||
else: The node is already in the cache
|
||||
id += 1
|
||||
|
||||
This tree can be walked from the root by repeatedly looking up the children of a node in the cache.
|
||||
|
||||
This structure facilitates compression because we can look at the children of a node:
|
||||
If two chidren have the same key, metadata and children then we can compress them into a single node.
|
||||
|
||||
"""
|
||||
root: CompressedNode
|
||||
cache: CacheType
|
||||
|
||||
@staticmethod
|
||||
def add_to_cache(cache : dict[NodeId, CompressedNode], data : NodeData, _children: tuple[NodeId, ...]) -> NodeId:
|
||||
"""
|
||||
This function is responsible for adding a new node to the cache and returning its id.
|
||||
Crucially we need a way to check if new nodes are already in the cache, so we hash them.
|
||||
But in case of a hash collision we need to increment the id and try again.
|
||||
This way we will always eventually find a unique id for the node.
|
||||
And we will never store the same node twice with a different id.
|
||||
"""
|
||||
_children = tuple(sorted(_children))
|
||||
id = hash((data, _children))
|
||||
|
||||
# To avoid hash collisions, we increment the id until we find a unique one
|
||||
tries = 0
|
||||
while True:
|
||||
tries += 1
|
||||
if id not in cache:
|
||||
# The node isn't in the cache and this id is free
|
||||
cache[id] = CompressedNode(id = id,
|
||||
data = data,
|
||||
_children = _children,
|
||||
_cache = cache)
|
||||
break
|
||||
|
||||
if cache[id].data == data and cache[id]._children == _children:
|
||||
break # The node is already in the cache
|
||||
|
||||
# This id is already in use by a different node so increment it (mod) and try again
|
||||
id = (id + 1) % (2**64)
|
||||
|
||||
if tries > 100:
|
||||
raise RuntimeError("Too many hash collisions, something is wrong.")
|
||||
|
||||
return id
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_tree(cls, tree : Tree) -> 'CompressedTree':
|
||||
cache = {}
|
||||
|
||||
def cache_tree(level : Tree) -> NodeId:
|
||||
node_data = NodeData(
|
||||
key = level.key,
|
||||
values = level.values,
|
||||
)
|
||||
|
||||
# Recursively cache the children
|
||||
children = tuple(cache_tree(c) for c in level.children)
|
||||
|
||||
# Add the node to the cache and return its id
|
||||
return cls.add_to_cache(cache, node_data, children)
|
||||
|
||||
root = cache_tree(tree)
|
||||
return cls(cache = cache, root = cache[root])
|
||||
|
||||
def __str__(self):
|
||||
return "".join(node_tree_to_string(self.root))
|
||||
|
||||
def html(self, depth = 2, debug = False) -> HTML:
|
||||
return HTML(node_tree_to_html(self.root, depth = depth, debug = debug))
|
||||
|
||||
def _repr_html_(self) -> str:
|
||||
return node_tree_to_html(self.root, depth = 2)
|
||||
|
||||
def __getitem__(self, args) -> 'CompressedTree':
|
||||
key, value = args
|
||||
for c in self.root.children:
|
||||
if c.data.key == key and value in c.data.values:
|
||||
data = dataclasses.replace(c.data, values = Enum((value,)))
|
||||
return CompressedTree(
|
||||
cache = self.cache,
|
||||
root = dataclasses.replace(c, data = data)
|
||||
)
|
||||
raise KeyError(f"Key {key} not found in children.")
|
||||
|
||||
def collapse_children(self, node: "CompressedNode") -> "CompressedNode":
|
||||
# First perform the collapse on the children
|
||||
new_children = [self.collapse_children(child) for child in node.children]
|
||||
|
||||
# Now take the set of new children and see if any have identical key, metadata and children
|
||||
# the values may different and will be collapsed into a single node
|
||||
identical_children = defaultdict(set)
|
||||
for child in new_children:
|
||||
identical_children[(child.data.key, child.data.metadata, child._children)].add(child)
|
||||
|
||||
# Now go through and create new compressed nodes for any groups that need collapsing
|
||||
new_children = []
|
||||
for (key, metadata, _children), child_set in identical_children.items():
|
||||
if len(child_set) > 1:
|
||||
# Compress the children into a single node
|
||||
assert all(isinstance(child.data.values, Enum) for child in child_set), "All children must have Enum values"
|
||||
node_data = NodeData(
|
||||
key = key,
|
||||
metadata = frozendict(), # Todo: Implement metadata compression
|
||||
values = Enum(tuple(v for child in child_set for v in child.data.values.values)),
|
||||
)
|
||||
|
||||
# Add the node to the cache
|
||||
id = type(self).add_to_cache(self.cache, node_data, _children)
|
||||
else:
|
||||
# If the group is size one just keep it
|
||||
id = child_set.pop().id
|
||||
|
||||
new_children.append(id)
|
||||
|
||||
id = self.add_to_cache(self.cache, node.data, tuple(sorted(new_children)))
|
||||
return self.cache[id]
|
||||
|
||||
|
||||
def compress(self) -> 'CompressedTree':
|
||||
return CompressedTree(cache = self.cache, root = self.collapse_children(self.root))
|
||||
|
||||
def lookup(self, selection : dict[str, str]):
|
||||
nodes = [self.root]
|
||||
for _ in range(1000):
|
||||
found = False
|
||||
current_node = nodes[-1]
|
||||
for c in current_node.children:
|
||||
if selection.get(c.data.key, None) in c.data.values:
|
||||
if found:
|
||||
raise RuntimeError("This tree is invalid, because it contains overlapping branches.")
|
||||
nodes.append(c)
|
||||
selection.pop(c.data.key)
|
||||
found = True
|
||||
|
||||
if not found:
|
||||
return nodes
|
||||
|
||||
raise RuntimeError("Maximum node searches exceeded, the tree contains a loop or something is buggy.")
|
||||
|
||||
|
||||
|
||||
|
||||
# def reconstruct(self) -> Tree:
|
||||
# def reconstruct_node(h : int) -> Tree:
|
||||
# node = self.cache[h]
|
||||
# dedup : dict[tuple[int, str], set[NodeId]] = defaultdict(set)
|
||||
# for index in self.cache[h].children:
|
||||
# child_node = self.cache[index]
|
||||
# child_hash = hash(child_node.children)
|
||||
# assert isinstance(child_node.values, Enum)
|
||||
# dedup[(child_hash, child_node.key)].add(index)
|
||||
|
||||
|
||||
# children = tuple(
|
||||
# Tree(key = key, values = Enum(tuple(values)),
|
||||
# children = tuple(reconstruct_node(i) for i in self.cache[next(indices)].children)
|
||||
# )
|
||||
# for (_, key), indices in dedup.items()
|
||||
# )
|
||||
|
||||
# return Tree(
|
||||
# key = node.key,
|
||||
# values = node.values,
|
||||
# children = children,
|
||||
# )
|
||||
# return reconstruct_node(self.root)
|
@ -1,5 +1,6 @@
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
|
||||
Tree = dict[str, "Tree"]
|
||||
@ -13,6 +14,11 @@ class RefcountedDict(dict[str, int]):
|
||||
def __hash__(self):
|
||||
return hash(tuple(sorted(self.items())))
|
||||
|
||||
@dataclass
|
||||
class JSONNode:
|
||||
key: str
|
||||
values: list[str]
|
||||
children: list["JSONNode"]
|
||||
|
||||
class CompressedTree():
|
||||
"""
|
||||
@ -101,6 +107,23 @@ class CompressedTree():
|
||||
return {f"{key}={','.join(values)}" : reconstruct_node(h, depth=depth+1) for (h, key), values in dedup.items()}
|
||||
return reconstruct_node(from_node or self.root_hash, depth=0)
|
||||
|
||||
def to_json(self, max_depth=None, from_node=None) -> dict:
|
||||
def reconstruct_node(h : int, depth : int) -> list[JSONNode]:
|
||||
if max_depth is not None and depth > max_depth:
|
||||
return {}
|
||||
dedup : dict[tuple[int, str], set[str]] = defaultdict(set)
|
||||
for k, h2 in self.cache[h].items():
|
||||
key, value = k.split("=")
|
||||
dedup[(h2, key)].add(value)
|
||||
|
||||
return [JSONNode(
|
||||
key = key,
|
||||
values = list(values),
|
||||
children = reconstruct_node(h, depth=depth+1),
|
||||
) for (h, key), values in dedup.items()]
|
||||
|
||||
return asdict(reconstruct_node(from_node or self.root_hash, depth=0)[0])
|
||||
|
||||
def __init__(self, tree : Tree):
|
||||
self.cache = {}
|
||||
self.empty_hash = hash(RefcountedDict({}))
|
||||
@ -139,8 +162,8 @@ class CompressedTree():
|
||||
return list(loc.keys())
|
||||
|
||||
def multi_match(self, request : dict[str, list[str]], loc = None):
|
||||
if not loc: return {"_END_" : {}}
|
||||
if loc is None: loc = self.tree
|
||||
if loc == {}: return {"_END_" : {}}
|
||||
matches = {}
|
||||
for request_key, request_values in request.items():
|
||||
for request_value in request_values:
|
||||
|
267
tree_compresser/python_src/tree_traverser/DataCubeTree.py
Normal file
267
tree_compresser/python_src/tree_traverser/DataCubeTree.py
Normal file
@ -0,0 +1,267 @@
|
||||
import dataclasses
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Callable, Hashable, Literal, Mapping
|
||||
|
||||
from frozendict import frozendict
|
||||
|
||||
from .tree_formatters import HTML, node_tree_to_html, node_tree_to_string
|
||||
from .value_types import DateRange, Enum, IntRange, TimeRange, Values
|
||||
|
||||
|
||||
def values_from_json(obj) -> Values:
|
||||
if isinstance(obj, list):
|
||||
return Enum(tuple(obj))
|
||||
|
||||
match obj["dtype"]:
|
||||
case "date": return DateRange(**obj)
|
||||
case "time": return TimeRange(**obj)
|
||||
case "int": return IntRange(**obj)
|
||||
case _: raise ValueError(f"Unknown dtype {obj['dtype']}")
|
||||
|
||||
# In practice use a frozendict
|
||||
Metadata = Mapping[str, str | int | float | bool]
|
||||
|
||||
@dataclass(frozen=True, eq=True, order=True)
|
||||
class NodeData:
|
||||
key: str
|
||||
values: Values
|
||||
metadata: dict[str, tuple[Hashable, ...]] = field(default_factory=frozendict, compare=False)
|
||||
|
||||
def summary(self) -> str:
|
||||
return f"{self.key}={self.values.summary()}" if self.key != "root" else "root"
|
||||
|
||||
@dataclass(frozen=True, eq=True, order=True)
|
||||
class Tree:
|
||||
data: NodeData
|
||||
children: tuple['Tree', ...]
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
return self.data.key
|
||||
|
||||
@property
|
||||
def values(self) -> Values:
|
||||
return self.data.values
|
||||
|
||||
@property
|
||||
def metadata(self) -> frozendict[str, Any]:
|
||||
return self.data.metadata
|
||||
|
||||
|
||||
def summary(self) -> str:
|
||||
return self.data.summary()
|
||||
|
||||
@classmethod
|
||||
def make(cls, key : str, values : Values, children, **kwargs) -> 'Tree':
|
||||
return cls(
|
||||
data = NodeData(key, values, metadata = kwargs.get("metadata", frozendict())
|
||||
),
|
||||
children = tuple(sorted(children)),
|
||||
)
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json: dict) -> 'Tree':
|
||||
def from_json(json: dict) -> Tree:
|
||||
return Tree.make(
|
||||
key=json["key"],
|
||||
values=values_from_json(json["values"]),
|
||||
metadata=json["metadata"] if "metadata" in json else {},
|
||||
children=tuple(from_json(c) for c in json["children"])
|
||||
)
|
||||
return from_json(json)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, d: dict) -> 'Tree':
|
||||
def from_dict(d: dict) -> tuple[Tree, ...]:
|
||||
return tuple(Tree.make(
|
||||
key=k.split("=")[0],
|
||||
values=Enum(tuple(k.split("=")[1].split("/"))),
|
||||
children=from_dict(children)
|
||||
) for k, children in d.items())
|
||||
|
||||
return Tree.make(key = "root",
|
||||
values=Enum(("root",)),
|
||||
children = from_dict(d))
|
||||
|
||||
@classmethod
|
||||
def empty(cls) -> 'Tree':
|
||||
return cls.make("root", Enum(("root",)), [])
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "".join(node_tree_to_string(node=self))
|
||||
|
||||
def html(self, depth = 2, collapse = True) -> HTML:
|
||||
return HTML(node_tree_to_html(self, depth = depth, collapse = collapse))
|
||||
|
||||
def _repr_html_(self) -> str:
|
||||
return node_tree_to_html(self, depth = 2, collapse = True)
|
||||
|
||||
def __getitem__(self, args) -> 'Tree':
|
||||
key, value = args
|
||||
for c in self.children:
|
||||
if c.key == key and value in c.values:
|
||||
data = dataclasses.replace(c.data, values = Enum((value,)))
|
||||
return dataclasses.replace(c, data = data)
|
||||
raise KeyError(f"Key {key} not found in children of {self.key}")
|
||||
|
||||
|
||||
def print(self, depth = None):
|
||||
print("".join(cc for c in self.children for cc in node_tree_to_string(node=c, depth = depth)))
|
||||
|
||||
def transform(self, func: 'Callable[[Tree], Tree | list[Tree]]') -> 'Tree':
|
||||
"""
|
||||
Call a function on every node of the tree, return one or more nodes.
|
||||
If multiple nodes are returned they each get a copy of the (transformed) children of the original node.
|
||||
Any changes to the children of a node will be ignored.
|
||||
"""
|
||||
def transform(node: Tree) -> list[Tree]:
|
||||
children = [cc for c in node.children for cc in transform(c)]
|
||||
new_nodes = func(node)
|
||||
if isinstance(new_nodes, Tree):
|
||||
new_nodes = [new_nodes]
|
||||
|
||||
return [dataclasses.replace(new_node, children = children)
|
||||
for new_node in new_nodes]
|
||||
|
||||
children = tuple(cc for c in self.children for cc in transform(c))
|
||||
return dataclasses.replace(self, children = children)
|
||||
|
||||
def guess_datatypes(self) -> 'Tree':
|
||||
def guess_datatypes(node: Tree) -> list[Tree]:
|
||||
# Try to convert enum values into more structured types
|
||||
children = tuple(cc for c in node.children for cc in guess_datatypes(c))
|
||||
|
||||
if isinstance(node.values, Enum):
|
||||
match node.key:
|
||||
case "time": range_class = TimeRange
|
||||
case "date": range_class = DateRange
|
||||
case _: range_class = None
|
||||
|
||||
if range_class is not None:
|
||||
return [
|
||||
dataclasses.replace(node, values = range, children = children)
|
||||
for range in range_class.from_strings(node.values.values)
|
||||
]
|
||||
return [dataclasses.replace(node, children = children)]
|
||||
|
||||
children = tuple(cc for c in self.children for cc in guess_datatypes(c))
|
||||
return dataclasses.replace(self, children = children)
|
||||
|
||||
|
||||
def select(self, selection : dict[str, str | list[str]], mode: Literal["strict", "relaxed"] = "relaxed") -> 'Tree':
|
||||
# make all values lists
|
||||
selection = {k : v if isinstance(v, list) else [v] for k,v in selection.items()}
|
||||
|
||||
def not_none(xs): return tuple(x for x in xs if x is not None)
|
||||
|
||||
def select(node: Tree) -> Tree | None:
|
||||
# Check if the key is specified in the selection
|
||||
if node.key not in selection:
|
||||
if mode == "strict":
|
||||
return None
|
||||
return dataclasses.replace(node, children = not_none(select(c) for c in node.children))
|
||||
|
||||
# If the key is specified, check if any of the values match
|
||||
values = Enum(tuple(c for c in selection[node.key] if c in node.values))
|
||||
|
||||
if not values:
|
||||
return None
|
||||
|
||||
return dataclasses.replace(node, values = values, children = not_none(select(c) for c in node.children))
|
||||
|
||||
return dataclasses.replace(self, children = not_none(select(c) for c in self.children))
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _insert(position: "Tree", identifier : list[tuple[str, list[str]]]):
|
||||
"""
|
||||
This algorithm goes as follows:
|
||||
We're at a particular node in the tree, and we have a list of key-values pairs that we want to insert.
|
||||
We take the first key values pair
|
||||
key, values = identifier.pop(0)
|
||||
|
||||
The general idea is to insert key, values into the current node and use recursion to handle the rest of the identifier.
|
||||
|
||||
We have two sources of values with possible overlap. The values to insert and the values attached to the children of this node.
|
||||
For each value coming from either source we put it in one of three categories:
|
||||
1) Values that exist only in the already existing child. (Coming exclusively from position.children)
|
||||
2) Values that exist in both a child and the new values.
|
||||
3) Values that exist only in the new values.
|
||||
|
||||
|
||||
Thus we add the values to insert to a set, and loop over the children.
|
||||
For each child we partition its values into the three categories.
|
||||
|
||||
For 1) we create a new child node with the key, reduced set of values and the same children.
|
||||
For 2)
|
||||
Create a new child node with the key, and the values in group 2
|
||||
Recurse to compute the children
|
||||
|
||||
Once we have finished looping over children we know all the values left over came exclusively from the new values.
|
||||
So we:
|
||||
Create a new node with these values.
|
||||
Recurse to compute the children
|
||||
|
||||
Finally we return the node with all these new children.
|
||||
"""
|
||||
if not identifier:
|
||||
return position
|
||||
|
||||
key, values = identifier.pop(0)
|
||||
# print(f"Inserting {key}={values} into {position.summary()}")
|
||||
|
||||
# Determine which children have this key
|
||||
possible_children = {c : [] for c in position.children if c.key == key}
|
||||
entirely_new_values = []
|
||||
|
||||
# For each value check it is already in one of the children
|
||||
for v in values:
|
||||
for c in possible_children:
|
||||
if v in c.values:
|
||||
possible_children[c].append(v)
|
||||
break
|
||||
else: # only executed if the loop did not break
|
||||
# If none of the children have this value, add it to the new child pile
|
||||
entirely_new_values.append(v)
|
||||
|
||||
# d = {p.summary() : v for p, v in possible_children.items()}
|
||||
# print(f" {d} new_values={entirely_new_values}")
|
||||
|
||||
new_children = []
|
||||
for c, affected in possible_children.items():
|
||||
if not affected:
|
||||
new_children.append(c)
|
||||
continue
|
||||
|
||||
unaffected = [x for x in c.values if x not in affected]
|
||||
if unaffected:
|
||||
unaffected_node = Tree.make(c.key, Enum(tuple(unaffected)), c.children)
|
||||
new_children.append(unaffected_node) # Add the unaffected part of this child
|
||||
|
||||
if affected: # This check is not technically necessary, but it makes the code more readable
|
||||
new_node = Tree.make(key, Enum(tuple(affected)), [])
|
||||
new_node = Tree._insert(new_node, identifier)
|
||||
new_children.append(new_node) # Add the affected part of this child
|
||||
|
||||
# If there are any values not in any of the existing children, add them as a new child
|
||||
if entirely_new_values:
|
||||
new_node = Tree.make(key, Enum(tuple(entirely_new_values)), [])
|
||||
new_children.append(Tree._insert(new_node, identifier))
|
||||
|
||||
return Tree.make(position.key, position.values, new_children)
|
||||
|
||||
def insert(self, identifier : dict[str, list[str]]) -> 'Tree':
|
||||
insertion = [(k, v) for k, v in identifier.items()]
|
||||
return Tree._insert(self, insertion)
|
||||
|
||||
def to_list_of_cubes(self):
|
||||
def to_list_of_cubes(node: Tree) -> list[list[Tree]]:
|
||||
return [[node] + sub_cube for c in node.children for sub_cube in to_list_of_cubes(c)]
|
||||
|
||||
return to_list_of_cubes(self)
|
||||
|
||||
def info(self):
|
||||
cubes = self.to_list_of_cubes()
|
||||
print(f"Number of distinct paths: {len(cubes)}")
|
116
tree_compresser/python_src/tree_traverser/tree_formatters.py
Normal file
116
tree_compresser/python_src/tree_traverser/tree_formatters.py
Normal file
@ -0,0 +1,116 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable, Protocol, Sequence, runtime_checkable
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class TreeLike(Protocol):
|
||||
@property
|
||||
def children(self) -> Sequence["TreeLike"]: ... # Supports indexing like node.children[i]
|
||||
|
||||
def summary(self, **kwargs) -> str: ...
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class HTML():
|
||||
html: str
|
||||
def _repr_html_(self):
|
||||
return self.html
|
||||
|
||||
def summarize_node(node: TreeLike, collapse = False, **kwargs) -> tuple[str, TreeLike]:
|
||||
"""
|
||||
Extracts a summarized representation of the node while collapsing single-child paths.
|
||||
Returns the summary string and the last node in the chain that has multiple children.
|
||||
"""
|
||||
summaries = []
|
||||
|
||||
while True:
|
||||
summary = node.summary(**kwargs)
|
||||
if len(summary) > 50:
|
||||
summary = summary[:50] + "..."
|
||||
summaries.append(summary)
|
||||
if not collapse:
|
||||
break
|
||||
|
||||
# Move down if there's exactly one child, otherwise stop
|
||||
if len(node.children) != 1:
|
||||
break
|
||||
node = node.children[0]
|
||||
|
||||
return ", ".join(summaries), node
|
||||
|
||||
def node_tree_to_string(node : TreeLike, prefix : str = "", depth = None) -> Iterable[str]:
|
||||
summary, node = summarize_node(node)
|
||||
|
||||
if depth is not None and depth <= 0:
|
||||
yield summary + " - ...\n"
|
||||
return
|
||||
# Special case for nodes with only a single child, this makes the printed representation more compact
|
||||
elif len(node.children) == 1:
|
||||
yield summary + ", "
|
||||
yield from node_tree_to_string(node.children[0], prefix, depth = depth)
|
||||
return
|
||||
else:
|
||||
yield summary + "\n"
|
||||
|
||||
for index, child in enumerate(node.children):
|
||||
connector = "└── " if index == len(node.children) - 1 else "├── "
|
||||
yield prefix + connector
|
||||
extension = " " if index == len(node.children) - 1 else "│ "
|
||||
yield from node_tree_to_string(child, prefix + extension, depth = depth - 1 if depth is not None else None)
|
||||
|
||||
def _node_tree_to_html(node : TreeLike, prefix : str = "", depth = 1, connector = "", **kwargs) -> Iterable[str]:
|
||||
summary, node = summarize_node(node, **kwargs)
|
||||
|
||||
if len(node.children) == 0:
|
||||
yield f'<span class="leaf">{connector}{summary}</span>'
|
||||
return
|
||||
else:
|
||||
open = "open" if depth > 0 else ""
|
||||
yield f"<details {open}><summary>{connector}{summary}</summary>"
|
||||
|
||||
for index, child in enumerate(node.children):
|
||||
connector = "└── " if index == len(node.children) - 1 else "├── "
|
||||
extension = " " if index == len(node.children) - 1 else "│ "
|
||||
yield from _node_tree_to_html(child, prefix + extension, depth = depth - 1, connector = prefix+connector, **kwargs)
|
||||
yield "</details>"
|
||||
|
||||
def node_tree_to_html(node : TreeLike, depth = 1, **kwargs) -> str:
|
||||
css = """
|
||||
<style>
|
||||
.qubed-tree-view {
|
||||
font-family: monospace;
|
||||
white-space: pre;
|
||||
}
|
||||
.qubed-tree-view details {
|
||||
# display: inline;
|
||||
margin-left: 0;
|
||||
}
|
||||
.qubed-tree-view summary {
|
||||
list-style: none;
|
||||
cursor: pointer;
|
||||
text-overflow: ellipsis;
|
||||
overflow: hidden;
|
||||
text-wrap: nowrap;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.qubed-tree-view .leaf {
|
||||
text-overflow: ellipsis;
|
||||
overflow: hidden;
|
||||
text-wrap: nowrap;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.qubed-tree-view summary:hover,span.leaf:hover {
|
||||
background-color: #f0f0f0;
|
||||
}
|
||||
.qubed-tree-view details > summary::after {
|
||||
content: ' ▲';
|
||||
}
|
||||
.qubed-tree-view details:not([open]) > summary::after {
|
||||
content: " ▼";
|
||||
}
|
||||
</style>
|
||||
|
||||
"""
|
||||
nodes = "".join(_node_tree_to_html(node=node, depth=depth, **kwargs))
|
||||
return f"{css}<pre class='qubed-tree-view'>{nodes}</pre>"
|
40
tree_compresser/python_src/tree_traverser/trie.py
Normal file
40
tree_compresser/python_src/tree_traverser/trie.py
Normal file
@ -0,0 +1,40 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
character = str
|
||||
|
||||
@dataclass(unsafe_hash=True)
|
||||
class TrieNode():
|
||||
parent: "TrieNode | None"
|
||||
parent_char: character
|
||||
children: dict[character, "TrieNode"] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Trie:
|
||||
root: TrieNode = field(default_factory=lambda: TrieNode(None, ""))
|
||||
reverse_lookup: dict[int, TrieNode] = field(default_factory=dict)
|
||||
|
||||
def insert(self, word: str):
|
||||
node = self.root
|
||||
for char in word:
|
||||
if char not in node.children:
|
||||
new_node = TrieNode(node, char)
|
||||
node.children[char] = new_node
|
||||
|
||||
node = node.children[char]
|
||||
|
||||
n_id = id(node)
|
||||
if n_id not in self.reverse_lookup:
|
||||
self.reverse_lookup[n_id] = node
|
||||
|
||||
return n_id
|
||||
|
||||
def lookup_by_id(self, n_id: int):
|
||||
leaf_node = self.reverse_lookup[n_id]
|
||||
string = []
|
||||
while leaf_node.parent is not None:
|
||||
string.append(leaf_node.parent_char)
|
||||
leaf_node = leaf_node.parent
|
||||
|
||||
return "".join(reversed(string))
|
||||
|
214
tree_compresser/python_src/tree_traverser/value_types.py
Normal file
214
tree_compresser/python_src/tree_traverser/value_types.py
Normal file
@ -0,0 +1,214 @@
|
||||
import dataclasses
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime, timedelta
|
||||
from typing import Any, Iterable, Literal
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Values(ABC):
|
||||
@abstractmethod
|
||||
def summary(self) -> str:
|
||||
pass
|
||||
@abstractmethod
|
||||
def __len__(self) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __contains__(self, value: Any) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def from_strings(self, values: Iterable[str]) -> list['Values']:
|
||||
pass
|
||||
|
||||
@dataclass(frozen=True, order=True)
|
||||
class Enum(Values):
|
||||
"""
|
||||
The simplest kind of key value is just a list of strings.
|
||||
summary -> string1/string2/string....
|
||||
"""
|
||||
values: tuple[Any, ...]
|
||||
|
||||
def __post_init__(self):
|
||||
assert isinstance(self.values, tuple)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.values)
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.values)
|
||||
def summary(self) -> str:
|
||||
return '/'.join(map(str, sorted(self.values)))
|
||||
def __contains__(self, value: Any) -> bool:
|
||||
return value in self.values
|
||||
def from_strings(self, values: Iterable[str]) -> list['Values']:
|
||||
return [Enum(tuple(values))]
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Range(Values, ABC):
|
||||
dtype: str = dataclasses.field(kw_only=True)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DateRange(Range):
|
||||
start: date
|
||||
end: date
|
||||
step: timedelta
|
||||
dtype: Literal["date"] = dataclasses.field(kw_only=True, default="date")
|
||||
|
||||
@classmethod
|
||||
def from_strings(self, values: Iterable[str]) -> list['DateRange']:
|
||||
dates = sorted([datetime.strptime(v, "%Y%m%d") for v in values])
|
||||
if len(dates) < 2:
|
||||
return [DateRange(
|
||||
start=dates[0],
|
||||
end=dates[0],
|
||||
step=timedelta(days=0)
|
||||
)]
|
||||
|
||||
ranges = []
|
||||
current_range, dates = [dates[0],], dates[1:]
|
||||
while len(dates) > 1:
|
||||
if dates[0] - current_range[-1] == timedelta(days=1):
|
||||
current_range.append(dates.pop(0))
|
||||
|
||||
elif len(current_range) == 1:
|
||||
ranges.append(DateRange(
|
||||
start=current_range[0],
|
||||
end=current_range[0],
|
||||
step=timedelta(days=0)
|
||||
))
|
||||
current_range = [dates.pop(0),]
|
||||
|
||||
else:
|
||||
ranges.append(DateRange(
|
||||
start=current_range[0],
|
||||
end=current_range[-1],
|
||||
step=timedelta(days=1)
|
||||
))
|
||||
current_range = [dates.pop(0),]
|
||||
return ranges
|
||||
|
||||
def __contains__(self, value: Any) -> bool:
|
||||
v = datetime.strptime(value, "%Y%m%d").date()
|
||||
return self.start <= v <= self.end and (v - self.start) % self.step == 0
|
||||
|
||||
|
||||
def __len__(self) -> int:
|
||||
return (self.end - self.start) // self.step
|
||||
|
||||
def summary(self) -> str:
|
||||
def fmt(d): return d.strftime("%Y%m%d")
|
||||
if self.step == timedelta(days=0):
|
||||
return f"{fmt(self.start)}"
|
||||
if self.step == timedelta(days=1):
|
||||
return f"{fmt(self.start)}/to/{fmt(self.end)}"
|
||||
|
||||
return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step // timedelta(days=1)}"
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TimeRange(Range):
|
||||
start: int
|
||||
end: int
|
||||
step: int
|
||||
dtype: Literal["time"] = dataclasses.field(kw_only=True, default="time")
|
||||
|
||||
@classmethod
|
||||
def from_strings(self, values: Iterable[str]) -> list['TimeRange']:
|
||||
if len(values) == 0: return []
|
||||
|
||||
times = sorted([int(v) for v in values])
|
||||
if len(times) < 2:
|
||||
return [TimeRange(
|
||||
start=times[0],
|
||||
end=times[0],
|
||||
step=100
|
||||
)]
|
||||
|
||||
ranges = []
|
||||
current_range, times = [times[0],], times[1:]
|
||||
while len(times) > 1:
|
||||
if times[0] - current_range[-1] == 1:
|
||||
current_range.append(times.pop(0))
|
||||
|
||||
elif len(current_range) == 1:
|
||||
ranges.append(TimeRange(
|
||||
start=current_range[0],
|
||||
end=current_range[0],
|
||||
step=0
|
||||
))
|
||||
current_range = [times.pop(0),]
|
||||
|
||||
else:
|
||||
ranges.append(TimeRange(
|
||||
start=current_range[0],
|
||||
end=current_range[-1],
|
||||
step=1
|
||||
))
|
||||
current_range = [times.pop(0),]
|
||||
return ranges
|
||||
|
||||
def __len__(self) -> int:
|
||||
return (self.end - self.start) // self.step
|
||||
|
||||
def summary(self) -> str:
|
||||
def fmt(d): return f"{d:04d}"
|
||||
if self.step == 0:
|
||||
return f"{fmt(self.start)}"
|
||||
return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"
|
||||
|
||||
def __contains__(self, value: Any) -> bool:
|
||||
v = int(value)
|
||||
return self.start <= v <= self.end and (v - self.start) % self.step == 0
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class IntRange(Range):
|
||||
start: int
|
||||
end: int
|
||||
step: int
|
||||
dtype: Literal["int"] = dataclasses.field(kw_only=True, default="int")
|
||||
|
||||
def __len__(self) -> int:
|
||||
return (self.end - self.start) // self.step
|
||||
|
||||
def summary(self) -> str:
|
||||
def fmt(d): return d.strftime("%Y%m%d")
|
||||
return f"{fmt(self.start)}/to/{fmt(self.end)}/by/{self.step}"
|
||||
|
||||
def __contains__(self, value: Any) -> bool:
|
||||
v = int(value)
|
||||
return self.start <= v <= self.end and (v - self.start) % self.step == 0
|
||||
|
||||
@classmethod
|
||||
def from_strings(self, values: Iterable[str]) -> list['IntRange']:
|
||||
if len(values) == 0: return []
|
||||
ints = sorted([int(v) for v in values])
|
||||
if len(ints) < 2:
|
||||
return [IntRange(
|
||||
start=ints[0],
|
||||
end=ints[0],
|
||||
step=0
|
||||
)]
|
||||
|
||||
ranges = []
|
||||
current_range, ints = [ints[0],], ints[1:]
|
||||
while len(ints) > 1:
|
||||
if ints[0] - current_range[-1] == 1:
|
||||
current_range.append(ints.pop(0))
|
||||
|
||||
elif len(current_range) == 1:
|
||||
ranges.append(IntRange(
|
||||
start=current_range[0],
|
||||
end=current_range[0],
|
||||
step=0
|
||||
))
|
||||
current_range = [ints.pop(0),]
|
||||
|
||||
else:
|
||||
ranges.append(IntRange(
|
||||
start=current_range[0],
|
||||
end=current_range[-1],
|
||||
step=1
|
||||
))
|
||||
current_range = [ints.pop(0),]
|
||||
return ranges
|
@ -3,15 +3,15 @@ from pathlib import Path
|
||||
|
||||
from tree_traverser import CompressedTree
|
||||
|
||||
data_path = Path("/home/eouser/qubed/config/climate-dt/compressed_tree.json")
|
||||
data_path = Path("./config/climate-dt/compressed_tree.json")
|
||||
# Print size of file
|
||||
print(f"climate dt compressed tree: {data_path.stat().st_size // 1e6:.1f} MB")
|
||||
|
||||
print("Opening json file")
|
||||
compressed_tree = CompressedTree.load(data_path)
|
||||
|
||||
print(compressed_tree.reconstruct_compressed_ecmwf_style())
|
||||
print(compressed_tree.to_json())
|
||||
|
||||
# print("Outputting compressed tree ecmwf style")
|
||||
# with open("data/compressed_tree_climate_dt_ecmwf_style.json", "w") as f:
|
||||
# json.dump(compressed_tree.reconstruct_compressed_ecmwf_style(), f)
|
||||
print("Outputting compressed tree ecmwf style")
|
||||
with open("config/climate-dt/new_format.json", "w") as f:
|
||||
json.dump(compressed_tree.to_json(), f)
|
||||
|
Loading…
x
Reference in New Issue
Block a user