From 70b1fd65e599dd599e61610c1e152836a672e3bb Mon Sep 17 00:00:00 2001 From: Tom Date: Wed, 2 Apr 2025 17:51:20 +0100 Subject: [PATCH] add remove_by_key and improve compression --- src/python/qubed/Qube.py | 73 ++++++++++++++++++++++------- src/python/qubed/set_operations.py | 9 ++++ src/python/qubed/tree_formatters.py | 2 +- tests/test_compression.py | 67 +++++++++++++++++++++----- 4 files changed, 122 insertions(+), 29 deletions(-) diff --git a/src/python/qubed/Qube.py b/src/python/qubed/Qube.py index 2c0584b..df8d415 100644 --- a/src/python/qubed/Qube.py +++ b/src/python/qubed/Qube.py @@ -1,4 +1,5 @@ import dataclasses +import functools from collections import defaultdict from collections.abc import Callable from dataclasses import dataclass @@ -122,6 +123,13 @@ class Qube: return Qube.root_node(list(from_dict(d))) + def to_dict(self) -> dict: + def to_dict(q: "Qube") -> tuple[str, dict]: + key = f"{q.key}={','.join(str(v) for v in q.values.values)}" + return key, dict(to_dict(c) for c in q.children) + + return to_dict(self)[1] + @classmethod def from_tree(cls, tree_str): lines = tree_str.splitlines() @@ -283,17 +291,20 @@ class Qube: else: yield leaf, metadata - def datacubes(self) -> "Qube": - def to_list_of_cubes(node: Qube) -> Iterable[Qube]: - if not node.children: - yield node - # print(node.key) - for c in node.children: - # print(c) - for sub_cube in to_list_of_cubes(c): - yield node.replace(children=[sub_cube]) + def datacubes(self) -> Iterable[dict[str, Any | list[Any]]]: + def to_list_of_cubes(node: Qube) -> Iterable[dict[str, Any | list[Any]]]: + if node.key == "root": + for c in node.children: + yield from to_list_of_cubes(c) - return Qube.root_node((q for c in self.children for q in to_list_of_cubes(c))) + if not node.children: + yield {node.key: list(node.values.values)} + + for c in node.children: + for sub_cube in to_list_of_cubes(c): + yield {node.key: list(node.values.values)} | sub_cube + + return to_list_of_cubes(self) def __getitem__(self, 
args) -> "Qube": if isinstance(args, str): @@ -354,6 +365,22 @@ class Qube: children = tuple(cc for c in self.children for cc in transform(c)) return self.replace(children=children) + def remove_by_key(self, keys: str | list[str]): + _keys: list[str] = keys if isinstance(keys, list) else [keys] + + def remove_key(node: "Qube") -> "Qube": + children = [] + for c in node.children: + if c.key in _keys: + grandchildren = tuple(sorted(remove_key(cc) for cc in c.children)) + children.extend(grandchildren) + else: + children.append(remove_key(c)) + + return node.replace(children=tuple(sorted(children))) + + return remove_key(self).compress() + def convert_dtypes(self, converters: dict[str, Callable[[Any], Any]]): def convert(node: Qube) -> Qube: if node.key in converters: @@ -474,11 +501,25 @@ class Qube: return hash_node(self) def compress(self) -> "Qube": - # First compress the children (this recursively compresses all the way to the leaves) - new_children = [child.compress() for child in self.children] + """ + This method is quite computationally heavy because of trees like this: + root, class=d1, generation=1 + ├── time=0600, many identical keys, param=8,78,79 + ├── time=0600, many identical keys, param=8,78,79 + └── time=0600, many identical keys, param=8,78,79 + This tree compresses down - # Now compress the set of children at this level - new_children = set_operations.compress_children(new_children) + """ - # Return the now compressed node - return Qube.make(self.key, self.values, new_children) + def union(a: "Qube", b: "Qube") -> "Qube": + b = type(self).root_node(children=(b,)) + out = set_operations.operation( + a, b, set_operations.SetOperation.UNION, type(self) + ) + return out + + new_children = [c.compress() for c in self.children] + if len(new_children) > 1: + new_children = functools.reduce(union, new_children, Qube.empty()).children + + return self.replace(children=tuple(sorted(new_children))) diff --git a/src/python/qubed/set_operations.py
b/src/python/qubed/set_operations.py index c6357db..82462c7 100644 --- a/src/python/qubed/set_operations.py +++ b/src/python/qubed/set_operations.py @@ -168,3 +168,12 @@ def compress_children(children: Iterable["Qube"]) -> tuple["Qube"]: new_children.append(new_child) return tuple(sorted(new_children, key=lambda n: ((n.key, n.values.min())))) + + +def union(a: "Qube", b: "Qube") -> "Qube": + return operation( + a, + b, + SetOperation.UNION, + type(a), + ) diff --git a/src/python/qubed/tree_formatters.py b/src/python/qubed/tree_formatters.py index 8c42179..43a411a 100644 --- a/src/python/qubed/tree_formatters.py +++ b/src/python/qubed/tree_formatters.py @@ -34,7 +34,7 @@ def summarize_node( while True: summary = node.summary(**kwargs) if "is_leaf" in node.metadata and node.metadata["is_leaf"]: - summary += "🌿" + summary += " 🌿" paths.append(summary) if len(summary) > max_summary_length: summary = summary[:max_summary_length] + "..." diff --git a/tests/test_compression.py b/tests/test_compression.py index 050e836..95b2ec9 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -15,19 +15,62 @@ def test_smoke(): } ) - # root - # ├── class=od, expver=0001/0002, param=1/2 - # └── class=rd - # ├── expver=0001, param=1/2/3 - # └── expver=0002, param=1/2 - ct = Qube.from_dict( + ct = Qube.from_tree(""" + root + ├── class=od, expver=0001/0002, param=1/2 + └── class=rd + ├── expver=0001, param=1/2/3 + └── expver=0002, param=1/2 + """) + + assert q.compress() == ct + + +def test_2(): + qube = Qube.from_dict( { - "class=od": {"expver=0001/0002": {"param=1/2": {}}}, - "class=rd": { - "expver=0001": {"param=1/2/3": {}}, - "expver=0002": {"param=1/2": {}}, - }, + "class=d1": { + "generation=1": { + "date=20240728": {"time=0600": {"param=8/78/79": {}}}, + "date=20240828": {"time=0600": {"param=8/78/79": {}}}, + "date=20240928": {"time=0600": {"param=8/78/79": {}}}, + } + } } ) - assert q.compress() == ct + target = Qube.from_datacube( + { + "class": "d1", + 
"generation": "1", + "date": ["20240728", "20240828", "20240928"], + "time": "0600", + "param": ["8", "78", "79"], + } + ) + assert qube.compress() == target + + +def test_removal_compression(): + qube = Qube.from_dict( + { + "class=d1": { + "generation=1": { + "month=07": {"date=20240728": {"time=0600": {"param=8/78/79": {}}}}, + "month=08": {"date=20240828": {"time=0600": {"param=8/78/79": {}}}}, + "month=09": {"date=20240928": {"time=0600": {"param=8/78/79": {}}}}, + } + } + } + ) + + target = Qube.from_datacube( + { + "class": "d1", + "generation": "1", + "date": ["20240728", "20240828", "20240928"], + "time": "0600", + "param": ["8", "78", "79"], + } + ) + assert qube.remove_by_key(["month"]) == target