diff --git a/background.md b/background.md new file mode 100644 index 0000000..dd1cdab --- /dev/null +++ b/background.md @@ -0,0 +1,113 @@ +# WIP +# Datacubes, Trees and Compressed trees + +This first part is essentially an abridged version of the [datacube spec](https://github.com/ecmwf/datacube-spec); see that document for more detail and the canonical source of truth on the matter. + +Qubed is primarily geared towards dealing with datafiles uniquely labeled by sets of key-value pairs. We'll call a set of key-value pairs that uniquely labels some data an `identifier`. Here's an example: + +```python +{'class': 'd1', + 'dataset': 'climate-dt', + 'generation': '1', + 'date': '20241102', + 'resolution': 'high', + 'time': '0000', +} +``` + +Unfortunately, we have more than one data file. If we are lucky, the set of identifiers that currently exists might form a dense datacube that we could represent like this: + +```python +{'class': ['d1', 'd2'], + 'dataset': 'climate-dt', + 'generation': ['1','2','3'], + 'model': 'icon', + 'date': ['20241102','20241103'], + 'resolution': ['high','low'], + 'time': ['0000', '0600', '1200', '1800'], +} +``` + +with the property that any particular choice for a value for any key will correspond to a datafile that exists. + +To save space, I will also represent this same thing like this: +``` +- class=d1/d2, dataset=climate-dt, generation=1/2/3, model=icon, date=20241102/20241103, resolution=high/low, time=0000/0600/1200/1800 +``` + +Unfortunately, we are not lucky and our datacubes are not always dense. In this case, we might instead represent which data exists using a tree: +``` +root +├── class=od +│ ├── expver=0001 +│ │ ├── param=1 +│ │ └── param=2 +│ └── expver=0002 +│ ├── param=1 +│ └── param=2 +└── class=rd + ├── expver=0001 + │ ├── param=1 + │ ├── param=2 + │ └── param=3 + └── expver=0002 + ├── param=1 + └── param=2 +``` + +But it's clear that the above tree contains a lot of redundant information. 
Many of the subtrees are identical, for example. Indeed, in practice, a lot of our data turns out to be 'nearly dense' in that it contains many dense datacubes within it. + +There are many valid ways one could compress this tree. If we add the restriction that no identical key=value pairs can be adjacent, then here is the compressed tree we might get: + +``` +root +├── class=rd +│ ├── expver=0001, param=1/2/3 +│ └── expver=0002, param=1/2 +└── class=od, expver=0001/0002, param=1/2 +``` + +Without the above restriction we could instead have: + +``` +root +├── class=rd +│ ├── expver=0001, param=3 +│ └── expver=0001/0002, param=1/2 +└── class=od, expver=0001/0002, param=1/2 +``` + +but we do not allow this because it would mean we would have to take multiple branches in order to find data with `expver=0001`. + +What we have now is a tree of dense datacubes which represents a single larger sparse datacube in a more compact manner. For want of a better word we'll call it a Qube. + +## API + +Qubed will provide a core compressed tree data structure called a Qube with: + +Methods to convert to and from: + - [x] A human-readable representation like those seen above. + - [x] An HTML version where subtrees can be collapsed. 
+ - [ ] A compact protobuf-based binary format + - [x] Nested Python dictionaries or JSON + - [/] The output of [fdb list](https://confluence.ecmwf.int/display/FDB/fdb-list) + - [ ] [mars list][mars list] + - [ ] [constraints.json][constraints] + +[constraints]: https://object-store.os-api.cci2.ecmwf.int/cci2-prod-catalogue/resources/reanalysis-era5-land/constraints_a0ae5b42d67869674e13fba9fd055640bcffc37c24578be1f465d7d5ab2c7ee5.json +[mars list]: https://git.ecmwf.int/projects/CDS/repos/cads-forms-reanalysis/browse/reanalysis-era5-single-levels/gecko-config/mars.list?at=refs%2Fheads%2Fprod + +Useful algorithms: + - [x] Compression + - [/] Union/Intersection/Difference + +Performant Membership Queries + - Identifier membership + - Datacube query (selection) + +Metadata Storage + + + + + diff --git a/tree_compresser/python_src/tree_traverser/CompressedDataCubeTree.py b/tree_compresser/python_src/tree_traverser/CompressedDataCubeTree.py index 82bc869..c4b4a9e 100644 --- a/tree_compresser/python_src/tree_traverser/CompressedDataCubeTree.py +++ b/tree_compresser/python_src/tree_traverser/CompressedDataCubeTree.py @@ -112,8 +112,10 @@ class CompressedTree: root = cache_tree(tree) return cls(cache = cache, root = cache[root]) - def __str__(self): - return "".join(node_tree_to_string(self.root)) + def __str__(self, depth=None) -> str: + return "".join(node_tree_to_string(self.root, depth = depth)) + + def print(self, depth = None): print(self.__str__(depth = depth)) def html(self, depth = 2, debug = False) -> HTML: return HTML(node_tree_to_html(self.root, depth = depth, debug = debug)) diff --git a/tree_compresser/python_src/tree_traverser/DataCubeTree.py b/tree_compresser/python_src/tree_traverser/DataCubeTree.py index 9e51802..6db97d2 100644 --- a/tree_compresser/python_src/tree_traverser/DataCubeTree.py +++ b/tree_compresser/python_src/tree_traverser/DataCubeTree.py @@ -89,14 +89,17 @@ class Tree: return cls.make("root", Enum(("root",)), []) - def __str__(self): - 
return "".join(node_tree_to_string(node=self)) + def __str__(self, depth = None) -> str: + return "".join(node_tree_to_string(node=self, depth = depth)) + + def print(self, depth = None): print(self.__str__(depth = depth)) def html(self, depth = 2, collapse = True) -> HTML: return HTML(node_tree_to_html(self, depth = depth, collapse = collapse)) def _repr_html_(self) -> str: return node_tree_to_html(self, depth = 2, collapse = True) + def __getitem__(self, args) -> 'Tree': key, value = args @@ -107,8 +110,6 @@ class Tree: raise KeyError(f"Key {key} not found in children of {self.key}") - def print(self, depth = None): - print("".join(cc for c in self.children for cc in node_tree_to_string(node=c, depth = depth))) def transform(self, func: 'Callable[[Tree], Tree | list[Tree]]') -> 'Tree': """ @@ -206,49 +207,44 @@ class Tree: Finally we return the node with all these new children. """ - if not identifier: - return position + pass + # if not identifier: + # return position - key, values = identifier.pop(0) - # print(f"Inserting {key}={values} into {position.summary()}") + # key, values = identifier.pop(0) + # # print(f"Inserting {key}={values} into {position.summary()}") - # Determine which children have this key - possible_children = {c : [] for c in position.children if c.key == key} - entirely_new_values = [] + # # Only the children with the matching key are relevant. 
+ # source_children = {c : [] for c in position.children if c.key == key} + # new_children = [] - # For each value check it is already in one of the children - for v in values: - for c in possible_children: - if v in c.values: - possible_children[c].append(v) - break - else: # only executed if the loop did not break - # If none of the children have this value, add it to the new child pile - entirely_new_values.append(v) + # values = set(values) + # for c in source_children: + # values_set = set(c.values) + # group_1 = values_set - values + # group_2 = values_set & values + # values = values - values_set # At the end of this loop values will contain only the new values - # d = {p.summary() : v for p, v in possible_children.items()} - # print(f" {d} new_values={entirely_new_values}") + # if group_1: + # group_1_node = Tree.make(c.key, Enum(tuple(group_1)), c.children) + # new_children.append(group_1_node) # Add the unaffected part of this child + + # if group_2: + # new_node = Tree.make(key, Enum(tuple(affected)), []) + # new_node = Tree._insert(new_node, identifier) + # new_children.append(new_node) # Add the affected part of this child - new_children = [] - for c, affected in possible_children.items(): - if not affected: - new_children.append(c) - continue - unaffected = [x for x in c.values if x not in affected] - if unaffected: - unaffected_node = Tree.make(c.key, Enum(tuple(unaffected)), c.children) - new_children.append(unaffected_node) # Add the unaffected part of this child + # unaffected = [x for x in c.values if x not in affected] - if affected: # This check is not technically necessary, but it makes the code more readable - new_node = Tree.make(key, Enum(tuple(affected)), []) - new_node = Tree._insert(new_node, identifier) - new_children.append(new_node) # Add the affected part of this child - # If there are any values not in any of the existing children, add them as a new child - if entirely_new_values: - new_node = Tree.make(key, 
Enum(tuple(entirely_new_values)), []) - new_children.append(Tree._insert(new_node, identifier)) + # if affected: # This check is not technically necessary, but it makes the code more readable + + + # # If there are any values not in any of the existing children, add them as a new child + # if entirely_new_values: + # new_node = Tree.make(key, Enum(tuple(entirely_new_values)), []) + # new_children.append(Tree._insert(new_node, identifier)) return Tree.make(position.key, position.values, new_children) diff --git a/tree_compresser/python_src/tree_traverser/__main__.py b/tree_compresser/python_src/tree_traverser/__main__.py new file mode 100644 index 0000000..b28bafe --- /dev/null +++ b/tree_compresser/python_src/tree_traverser/__main__.py @@ -0,0 +1,41 @@ +import argparse + +# A simple command line app that reads from standard input and writes to standard output +# Arguments: +# --input_format=fdb/mars +# --output_format=text/html +import sys + + +def main(): + parser = argparse.ArgumentParser(description="Generate a compressed tree from various inputs.") + + parser.add_argument( + "--input_format", + choices=["fdb", "mars"], + default="fdb", + help="Specify the input format (fdb list or mars)." + ) + + parser.add_argument( + "--output_format", + choices=["text", "html"], + default="text", + help="Specify the output format (text or html)." 
+ ) + + args = parser.parse_args() + + # Read from standard input + l = 0 + for line in sys.stdin.readlines(): + + + # Process data (For now, just echoing the input) + output_data = f"[Input Format: {args.input_format}] [Output Format: {args.output_format}]\n{l} lines read from standard input\n" + + # Write to standard output + sys.stdout.write(output_data) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tree_compresser/tests/data/fdb_list_compact.gz b/tree_compresser/tests/data/fdb_list_compact.gz new file mode 100644 index 0000000..7337c7d Binary files /dev/null and b/tree_compresser/tests/data/fdb_list_compact.gz differ diff --git a/tree_compresser/tests/data/fdb_list_porcelain.gz b/tree_compresser/tests/data/fdb_list_porcelain.gz new file mode 100644 index 0000000..a664597 Binary files /dev/null and b/tree_compresser/tests/data/fdb_list_porcelain.gz differ diff --git a/tree_compresser/tests/data/mars_list.gz b/tree_compresser/tests/data/mars_list.gz new file mode 100644 index 0000000..b5d8255 Binary files /dev/null and b/tree_compresser/tests/data/mars_list.gz differ