diff --git a/docs/index.md b/docs/index.md index 2c88146..8181d5c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -97,11 +97,6 @@ but we do not allow this because it would mean we would have to take multiple br What we have now is a tree of dense datacubes which represents a single larger sparse datacube in a more compact manner. For want of a better word we'll call it a Qube. -## HTML Output - -```{code-cell} python3 -q.compress().html() -```` ## API diff --git a/docs/quickstart.md b/docs/quickstart.md index 7c624ad..ac0790c 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -74,4 +74,14 @@ A.print(name="A"), B.print(name="B"); A | B ``` +### Command Line Usage + +```bash +fdb list class=rd,expver=0001,... | qubed --from=fdblist --to=text +``` + +`--from` options include: `fdblist`, `json`, `protobuf`, `marslist`, `constraints`. +`--to` options include: `text`, `html`, `json`, `datacubes`, `constraints`. + +Use `--input` and `--output` to specify input and output files respectively. diff --git a/src/python/qubed/Qube.py b/src/python/qubed/Qube.py index 93c2c79..8020a1a 100644 --- a/src/python/qubed/Qube.py +++ b/src/python/qubed/Qube.py @@ -1,44 +1,17 @@ import dataclasses from collections import defaultdict -from dataclasses import dataclass, field +from dataclasses import dataclass from functools import cached_property -from typing import Any, Callable, Hashable, Literal, Mapping +from typing import Any, Callable, Literal from frozendict import frozendict from . 
import set_operations +from .node_types import NodeData, RootNodeData from .tree_formatters import HTML, node_tree_to_html, node_tree_to_string -from .value_types import DateRange, Enum, IntRange, TimeRange, Values +from .value_types import QEnum, Values, values_from_json -def values_from_json(obj) -> Values: - if isinstance(obj, list): - return Enum(tuple(obj)) - - match obj["dtype"]: - case "date": return DateRange(**obj) - case "time": return TimeRange(**obj) - case "int": return IntRange(**obj) - case _: raise ValueError(f"Unknown dtype {obj['dtype']}") - -# In practice use a frozendict -Metadata = Mapping[str, str | int | float | bool] - -@dataclass(frozen=True, eq=True, order=True) -class NodeData: - key: str - values: Values - metadata: dict[str, tuple[Hashable, ...]] = field(default_factory=frozendict, compare=False) - - def summary(self) -> str: - return f"{self.key}={self.values.summary()}" if self.key != "root" else "root" - -@dataclass(frozen=True, eq=True, order=True) -class RootNodeData(NodeData): - "Helper class to print a custom root name" - def summary(self) -> str: - return self.key - @dataclass(frozen=True, eq=True, order=True) class Qube: data: NodeData @@ -85,17 +58,17 @@ class Qube: def from_dict(d: dict) -> tuple[Qube, ...]: return tuple(Qube.make( key=k.split("=")[0], - values=Enum(tuple(k.split("=")[1].split("/"))), + values=QEnum((k.split("=")[1].split("/"))), children=from_dict(children) ) for k, children in d.items()) return Qube.make(key = "root", - values=Enum(("root",)), + values=QEnum(("root",)), children = from_dict(d)) @classmethod def empty(cls) -> 'Qube': - return cls.make("root", Enum(("root",)), []) + return cls.make("root", QEnum(("root",)), []) def __str__(self, depth = None, name = None) -> str: @@ -119,7 +92,7 @@ class Qube: key, value = args for c in self.children: if c.key == key and value in c.values: - data = dataclasses.replace(c.data, values = Enum((value,))) + data = dataclasses.replace(c.data, values = 
QEnum((value,))) return dataclasses.replace(c, data = data) raise KeyError(f"Key {key} not found in children of {self.key}") @@ -164,7 +137,7 @@ class Qube: return dataclasses.replace(node, children = not_none(select(c) for c in node.children)) # If the key is specified, check if any of the values match - values = Enum(tuple(c for c in selection[node.key] if c in node.values)) + values = QEnum((c for c in selection[node.key] if c in node.values)) if not values: return None @@ -225,11 +198,11 @@ class Qube: # values = values - values_set # At the end of this loop values will contain only the new values # if group_1: - # group_1_node = Qube.make(c.key, Enum(tuple(group_1)), c.children) + # group_1_node = Qube.make(c.key, QEnum((group_1)), c.children) # new_children.append(group_1_node) # Add the unaffected part of this child # if group_2: - # new_node = Qube.make(key, Enum(tuple(affected)), []) + # new_node = Qube.make(key, QEnum((affected)), []) # new_node = Qube._insert(new_node, identifier) # new_children.append(new_node) # Add the affected part of this child @@ -242,7 +215,7 @@ class Qube: # # If there are any values not in any of the existing children, add them as a new child # if entirely_new_values: - # new_node = Qube.make(key, Enum(tuple(entirely_new_values)), []) + # new_node = Qube.make(key, QEnum((entirely_new_values)), []) # new_children.append(Qube._insert(new_node, identifier)) return Qube.make(position.key, position.values, new_children) @@ -292,12 +265,12 @@ class Qube: key = child_set[0].key # Compress the children into a single node - assert all(isinstance(child.data.values, Enum) for child in child_set), "All children must have Enum values" + assert all(isinstance(child.data.values, QEnum) for child in child_set), "All children must have QEnum values" node_data = NodeData( key = key, metadata = frozendict(), # Todo: Implement metadata compression - values = Enum(tuple(v for child in child_set for v in child.data.values.values)), + values = 
QEnum((v for child in child_set for v in child.data.values.values)), ) new_child = Qube(data = node_data, children = child_set[0].children) else: diff --git a/src/python/qubed/node_types.py b/src/python/qubed/node_types.py new file mode 100644 index 0000000..7abbe75 --- /dev/null +++ b/src/python/qubed/node_types.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass, field +from typing import Hashable + +from frozendict import frozendict + +from .value_types import Values + + +@dataclass(frozen=True, eq=True, order=True) +class NodeData: + key: str + values: Values + metadata: dict[str, tuple[Hashable, ...]] = field(default_factory=frozendict, compare=False) + + def summary(self) -> str: + return f"{self.key}={self.values.summary()}" if self.key != "root" else "root" + +@dataclass(frozen=True, eq=True, order=True) +class RootNodeData(NodeData): + "Helper class to print a custom root name" + def summary(self) -> str: + return self.key \ No newline at end of file diff --git a/src/python/qubed/set_operations.py b/src/python/qubed/set_operations.py index 74d21fb..1699ae7 100644 --- a/src/python/qubed/set_operations.py +++ b/src/python/qubed/set_operations.py @@ -1,5 +1,15 @@ -from enum import Enum +import dataclasses from collections import defaultdict +from enum import Enum + +# Prevent circular imports while allowing the type checker to know what Qube is +from typing import TYPE_CHECKING, Iterable + +from .node_types import NodeData +from .value_types import QEnum, Values + +if TYPE_CHECKING: + from .qube import Qube class SetOperation(Enum): @@ -8,14 +18,58 @@ class SetOperation(Enum): DIFFERENCE = (1, 0, 0) SYMMETRIC_DIFFERENCE = (1, 0, 1) +def fused_set_operations(A: "Values", B: "Values") -> tuple[list[Values], list[Values], list[Values]]: + if isinstance(A, QEnum) and isinstance(B, QEnum): + set_A, set_B = set(A), set(B) + intersection = set_A & set_B + just_A = set_A - intersection + just_B = set_B - intersection + return [QEnum(just_A),], 
[QEnum(intersection),], [QEnum(just_B),] + + + raise NotImplementedError("Fused set operations on values types other than QEnum are not yet implemented") -def operation(A: "Qube", B : "Qube", type: SetOperation) -> "Qube": - # Sort nodes from both qubes by their keys - nodes_by_key = defaultdict(lambda : dict(A = [], B = [])) - for node in A.nodes: - nodes_by_key[node.key]["A"].append(node) - for key, ndoes +def operation(A: "Qube", B : "Qube", operation_type: SetOperation) -> "Qube": + assert A.key == B.key, "The two Qube root nodes must have the same key to perform set operations," \ + f"would usually be two root nodes. They have {A.key} and {B.key} respectively" + + assert A.values == B.values, f"The two Qube root nodes must have the same values to perform set operations {A.values = }, {B.values = }" + + # Group the children of the two nodes by key + nodes_by_key = defaultdict(lambda : ([], [])) + for node in A.children: + nodes_by_key[node.key][0].append(node) + for node in B.children: + nodes_by_key[node.key][1].append(node) + + new_children = [] + + # For every node group, perform the set operation + for key, (A_nodes, B_nodes) in nodes_by_key.items(): + new_children.extend(_operation(key, A_nodes, B_nodes, operation_type)) + + # The values and key are the same so we just replace the children + return dataclasses.replace(A, children=new_children) + # The root node is special so we need a helper method that we can recurse on -def _operation(A: list["Qube"], B : list["Qube"], type: SetOperation) -> "Qube": - pass \ No newline at end of file +def _operation(key: str, A: list["Qube"], B : list["Qube"], operation_type: SetOperation) -> Iterable["Qube"]: + for node_a in A: + for node_b in B: + just_A, intersection, just_B = fused_set_operations( + node_a.values, + node_b.values + ) + for values in just_A: + data = NodeData(key, values, {}) + yield type(node_a)(data, node_a.children) + + if intersection: + intersected_children = operation(node_a, node_b, 
operation_type) + for values in intersection: + data = NodeData(key, values, {}) + yield type(node_a)(data, intersected_children) + + for values in just_B: + data = NodeData(key, values, {}) + yield type(node_a)(data, node_b.children) \ No newline at end of file diff --git a/src/python/qubed/tree_formatters.py b/src/python/qubed/tree_formatters.py index b8782a7..f99429e 100644 --- a/src/python/qubed/tree_formatters.py +++ b/src/python/qubed/tree_formatters.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass from typing import Iterable, Protocol, Sequence, runtime_checkable @@ -74,46 +75,56 @@ def _node_tree_to_html(node : TreeLike, prefix : str = "", depth = 1, connector yield "" def node_tree_to_html(node : TreeLike, depth = 1, **kwargs) -> str: - css = """ + css_id = f"qubed-tree-{random.randint(0, 1000000)}" + css = f""" - """ nodes = "".join(_node_tree_to_html(node=node, depth=depth, **kwargs)) - return f"{css}
{nodes}" \ No newline at end of file + return f"{css}
{nodes}" \ No newline at end of file diff --git a/src/python/qubed/value_types.py b/src/python/qubed/value_types.py index bae29f7..6557514 100644 --- a/src/python/qubed/value_types.py +++ b/src/python/qubed/value_types.py @@ -2,7 +2,7 @@ import dataclasses from abc import ABC, abstractmethod from dataclasses import dataclass from datetime import date, datetime, timedelta -from typing import Any, Iterable, Literal +from typing import Any, FrozenSet, Iterable, Literal, TypeVar @dataclass(frozen=True) @@ -22,13 +22,19 @@ class Values(ABC): def from_strings(self, values: Iterable[str]) -> list['Values']: pass +T = TypeVar("T") +EnumValuesType = FrozenSet[T] @dataclass(frozen=True, order=True) -class Enum(Values): + +class QEnum(Values): """ The simplest kind of key value is just a list of strings. summary -> string1/string2/string.... """ - values: tuple[Any, ...] + values: EnumValuesType + + def __init__(self, obj): + object.__setattr__(self, 'values', frozenset(obj)) def __post_init__(self): assert isinstance(self.values, tuple) @@ -43,7 +49,7 @@ class Enum(Values): def __contains__(self, value: Any) -> bool: return value in self.values def from_strings(self, values: Iterable[str]) -> list['Values']: - return [Enum(tuple(values))] + return [type(self)(tuple(values))] @dataclass(frozen=True) class Range(Values, ABC): @@ -115,8 +121,6 @@ class TimeRange(Range): @classmethod def from_strings(self, values: Iterable[str]) -> list['TimeRange']: - if len(values) == 0: return [] - times = sorted([int(v) for v in values]) if len(times) < 2: return [TimeRange( @@ -181,7 +185,6 @@ class IntRange(Range): @classmethod def from_strings(self, values: Iterable[str]) -> list['IntRange']: - if len(values) == 0: return [] ints = sorted([int(v) for v in values]) if len(ints) < 2: return [IntRange( @@ -211,4 +214,14 @@ class IntRange(Range): step=1 )) current_range = [ints.pop(0),] - return ranges \ No newline at end of file + return ranges + +def values_from_json(obj) -> Values: + if 
isinstance(obj, list): + return QEnum(tuple(obj)) + + match obj["dtype"]: + case "date": return DateRange(**obj) + case "time": return TimeRange(**obj) + case "int": return IntRange(**obj) + case _: raise ValueError(f"Unknown dtype {obj['dtype']}") diff --git a/tests/test_basic_operations.py b/tests/test_basic_operations.py index dd2883f..82c5ac5 100644 --- a/tests/test_basic_operations.py +++ b/tests/test_basic_operations.py @@ -24,4 +24,17 @@ def test_n_leaves(): }) # Size is 3*3*3 + 1*1*1 = 27 + 1 - assert q.n_leaves == 27 + 1 \ No newline at end of file + assert q.n_leaves == 27 + 1 + + +# def test_union(): +# q = Qube.from_dict({"a=1/2/3" : {"b=1" : {}},}) +# r = Qube.from_dict({"a=2/3/4" : {"b=2" : {}},}) + +# u = Qube.from_dict({ +# "a=1" : {"b=1" : {}}, +# "a=1/2/3" : {"b=1/2" : {}}, +# "a=4" : {"b=2" : {}}, +# }) + +# assert q | r == u \ No newline at end of file