add remove_by_key and improve compression

This commit is contained in:
Tom 2025-04-02 17:51:20 +01:00
parent 2e36db4268
commit 70b1fd65e5
4 changed files with 122 additions and 29 deletions

View File

@ -1,4 +1,5 @@
import dataclasses
import functools
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
@ -122,6 +123,13 @@ class Qube:
return Qube.root_node(list(from_dict(d)))
def to_dict(self) -> dict:
    """
    Convert the subtree below this node into nested plain dictionaries.

    Each descendant becomes one entry whose key is ``"<key>=<v1>,<v2>,..."``
    (the node's values stringified and joined with commas) and whose value
    is the dictionary built from that descendant's own children. The entry
    for this node itself is dropped; only its children's mapping is returned.
    """

    def entry(node: "Qube") -> tuple[str, dict]:
        # Build the "key=v1,v2" label first, then recurse into the children.
        joined = ",".join(map(str, node.values.values))
        label = f"{node.key}={joined}"
        subtree = {}
        for child in node.children:
            child_label, child_tree = entry(child)
            # Siblings that produce the same label overwrite one another,
            # exactly as they would when building a dict from pairs.
            subtree[child_label] = child_tree
        return label, subtree

    _, tree = entry(self)
    return tree
@classmethod
def from_tree(cls, tree_str):
lines = tree_str.splitlines()
@ -283,17 +291,20 @@ class Qube:
else:
yield leaf, metadata
def datacubes(self) -> "Qube":
def to_list_of_cubes(node: Qube) -> Iterable[Qube]:
if not node.children:
yield node
# print(node.key)
for c in node.children:
# print(c)
for sub_cube in to_list_of_cubes(c):
yield node.replace(children=[sub_cube])
def datacubes(self) -> Iterable[dict[str, Any | list[Any]]]:
def to_list_of_cubes(node: Qube) -> Iterable[dict[str, Any | list[Any]]]:
if node.key == "root":
for c in node.children:
yield from to_list_of_cubes(c)
return Qube.root_node((q for c in self.children for q in to_list_of_cubes(c)))
if not node.children:
yield {node.key: list(node.values.values)}
for c in node.children:
for sub_cube in to_list_of_cubes(c):
yield {node.key: list(node.values.values)} | sub_cube
return to_list_of_cubes(self)
def __getitem__(self, args) -> "Qube":
if isinstance(args, str):
@ -354,6 +365,22 @@ class Qube:
children = tuple(cc for c in self.children for cc in transform(c))
return self.replace(children=children)
def remove_by_key(self, keys: str | list[str]):
_keys: list[str] = keys if isinstance(keys, list) else [keys]
def remove_key(node: "Qube") -> "Qube":
children = []
for c in node.children:
if c.key in _keys:
grandchildren = tuple(sorted(remove_key(cc) for cc in c.children))
children.extend(grandchildren)
else:
children.append(remove_key(c))
return node.replace(children=tuple(sorted(children)))
return remove_key(self).compress()
def convert_dtypes(self, converters: dict[str, Callable[[Any], Any]]):
def convert(node: Qube) -> Qube:
if node.key in converters:
@ -474,11 +501,25 @@ class Qube:
return hash_node(self)
def compress(self) -> "Qube":
    """
    Return a copy of this node whose subtree has been compressed.

    The children are compressed first (recursively, all the way to the
    leaves). If more than one compressed child remains, they are folded
    together one at a time, starting from ``Qube.empty()``, using the UNION
    set operation; the children of the folded result become this node's
    children. The final children are stored as a sorted tuple.

    This method is quite computationally heavy because of trees like this:

        root, class=d1, generation=1
            time=0600, many identical keys, param=8,78,79
            time=0600, many identical keys, param=8,78,79
            time=0600, many identical keys, param=8,78,79

    NOTE(review): the original note broke off after "This tree compresses
    down"; the intended compressed form was never written out.
    """

    def union(a: "Qube", b: "Qube") -> "Qube":
        # `a` is the accumulator (initially Qube.empty(), afterwards the
        # result of the previous union); `b` is a single compressed child,
        # wrapped in a root node before being handed to the set operation.
        wrapped = type(self).root_node(children=(b,))
        return set_operations.operation(
            a, wrapped, set_operations.SetOperation.UNION, type(self)
        )

    new_children = [c.compress() for c in self.children]
    # Zero or one child: there are no siblings to merge.
    if len(new_children) > 1:
        new_children = functools.reduce(union, new_children, Qube.empty()).children

    return self.replace(children=tuple(sorted(new_children)))

View File

@ -168,3 +168,12 @@ def compress_children(children: Iterable["Qube"]) -> tuple["Qube"]:
new_children.append(new_child)
return tuple(sorted(new_children, key=lambda n: ((n.key, n.values.min()))))
def union(a: "Qube", b: "Qube") -> "Qube":
    """
    Combine two qubes with the UNION set operation.

    Delegates to ``operation`` with ``SetOperation.UNION``, passing the
    concrete type of *a* as the node type argument.
    """
    node_type = type(a)
    return operation(a, b, SetOperation.UNION, node_type)

View File

@ -34,7 +34,7 @@ def summarize_node(
while True:
summary = node.summary(**kwargs)
if "is_leaf" in node.metadata and node.metadata["is_leaf"]:
summary += "🌿"
summary += " 🌿"
paths.append(summary)
if len(summary) > max_summary_length:
summary = summary[:max_summary_length] + "..."

View File

@ -15,19 +15,62 @@ def test_smoke():
}
)
# root
# ├── class=od, expver=0001/0002, param=1/2
# └── class=rd
# ├── expver=0001, param=1/2/3
# └── expver=0002, param=1/2
ct = Qube.from_dict(
ct = Qube.from_tree("""
root
class=od, expver=0001/0002, param=1/2
class=rd
expver=0001, param=1/2/3
expver=0002, param=1/2
""")
assert q.compress() == ct
def test_2():
    """Three sibling date branches with identical subtrees compress into one datacube."""
    qube = Qube.from_dict(
        {
            "class=d1": {
                "generation=1": {
                    "date=20240728": {"time=0600": {"param=8/78/79": {}}},
                    "date=20240828": {"time=0600": {"param=8/78/79": {}}},
                    "date=20240928": {"time=0600": {"param=8/78/79": {}}},
                }
            }
        }
    )

    # The expected result is the single datacube spanning all three dates.
    target = Qube.from_datacube(
        {
            "class": "d1",
            "generation": "1",
            "date": ["20240728", "20240828", "20240928"],
            "time": "0600",
            "param": ["8", "78", "79"],
        }
    )

    assert qube.compress() == target
def test_removal_compression():
    """Removing the ``month`` level leaves branches that compress into one datacube."""

    def time_and_param() -> dict:
        # A fresh dict per branch so no two branches share nested objects.
        return {"time=0600": {"param=8/78/79": {}}}

    by_month = {
        "month=07": {"date=20240728": time_and_param()},
        "month=08": {"date=20240828": time_and_param()},
        "month=09": {"date=20240928": time_and_param()},
    }
    qube = Qube.from_dict({"class=d1": {"generation=1": by_month}})

    expected_cube = {
        "class": "d1",
        "generation": "1",
        "date": ["20240728", "20240828", "20240928"],
        "time": "0600",
        "param": ["8", "78", "79"],
    }
    target = Qube.from_datacube(expected_cube)

    stripped = qube.remove_by_key(["month"])
    assert stripped == target