add tree_compresser
parent 50d86c77ec
commit df8ea6c2f9

tree_compresser/pyproject.toml (new file, 13 lines)
@@ -0,0 +1,13 @@
[build-system]
requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "TreeTraverser"
description = "Tools to work with compressed Datacubes and Trees"
dynamic = ["version"]
dependencies = [
    "fastapi",
    "pe"
]
tree_compresser/src/TreeTraverser/CompressedTree.py (new file, 305 lines)
@@ -0,0 +1,305 @@
import json
from collections import defaultdict
from typing import TypeVar
from pathlib import Path

Tree = dict[str, "Tree"]


class RefcountedDict(dict[str, int]):
    refcount: int = 1

    def __repr__(self):
        return f"RefcountedDict(refcount={self.refcount}, {super().__repr__()})"

    def __hash__(self):
        return hash(tuple(sorted(self.items())))

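
# Note (added for illustration, not part of the committed file): RefcountedDict
# hashes by content, so two levels with identical contents produce the same cache
# key and are stored only once, e.g.
#   hash(RefcountedDict({"c": 123})) == hash(RefcountedDict({"c": 123}))  # True
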
class CompressedTree():
    """
    An implementation of a compressed tree that supports lookup, insertion, deletion and caching.
    The caching means that identical subtrees are stored only once, saving memory.
    This is implemented internally by storing all subtrees in a global hash table.
    """
    cache: dict[int, RefcountedDict]
    tree: RefcountedDict

    def _add_to_cache(self, level: RefcountedDict) -> int:
        "Add a level {key -> hash} to the cache"
        h = hash(level)
        if h not in self.cache:
            # Increase refcounts of the child nodes
            for child_h in level.values():
                self.cache[child_h].refcount += 1
            self.cache[h] = RefcountedDict(level)
        else:
            self.cache[h].refcount += 1
        return h

    def _replace_in_cache(self, old_h, level: RefcountedDict) -> int:
        """
        Replace the object at old_h with a different object level.
        If the two objects are identical this is a no-op.
        """
        # Start by adding the new object to the cache
        new_h = self._add_to_cache(level)

        # Now check if the old object needs to be garbage collected
        self._decrease_refcount(old_h)

        return new_h

    def _decrease_refcount(self, h: int):
        self.cache[h].refcount -= 1
        if self.cache[h].refcount == 0:
            # Recursively decrease refcounts of child nodes
            for child_h in self.cache[h].values():
                self._decrease_refcount(child_h)
            del self.cache[h]

    def cache_tree(self, tree: Tree) -> int:
        "Insert the given tree (a dictionary of dictionaries) and all of its children, recursively, into the hash table and return the hash key"
        level = RefcountedDict({k: self.cache_tree(v) for k, v in tree.items()})
        return self._add_to_cache(level)

    def _cache_path(self, path: list[str]) -> int:
        "Treat path = [x, y, z...] like {x: {y: {z: ...}}} and cache that"
        if not path:
            return self.empty_hash
        k, *rest = path
        return self._add_to_cache(RefcountedDict({k: self._cache_path(rest)}))

    def reconstruct(self) -> dict[str, dict]:
        "Reconstruct the tree as a normal nested dictionary"
        def reconstruct_node(h: int) -> dict[str, dict]:
            return {k: reconstruct_node(v) for k, v in self.cache[h].items()}
        return reconstruct_node(self.root_hash)

    def reconstruct_compressed(self) -> dict[str, dict]:
        "Reconstruct the tree as a nested dictionary, grouping keys that share an identical subtree"
        def reconstruct_node(h: int) -> dict[str, dict]:
            dedup: dict[int, set[str]] = defaultdict(set)
            for k, h2 in self.cache[h].items():
                dedup[h2].add(k)

            return {"/".join(keys): reconstruct_node(h) for h, keys in dedup.items()}
        return reconstruct_node(self.root_hash)

    def reconstruct_compressed_ecmwf_style(self) -> dict[str, dict]:
        "Reconstruct the tree as a nested dictionary, grouping values of a key (key=v1,v2) that share an identical subtree"
        def reconstruct_node(h: int) -> dict[str, dict]:
            dedup: dict[tuple[int, str], set[str]] = defaultdict(set)
            for k, h2 in self.cache[h].items():
                key, value = k.split("=")
                dedup[(h2, key)].add(value)

            return {f"{key}={','.join(values)}": reconstruct_node(h) for (h, key), values in dedup.items()}
        return reconstruct_node(self.root_hash)

    def __init__(self, tree: Tree):
        self.cache = {}
        self.empty_hash = hash(RefcountedDict({}))

        # Recursively cache the tree
        self.root_hash = self.cache_tree(tree)

        # Keep a reference to the root of the tree
        self.tree = self.cache[self.root_hash]

    def lookup(self, keys: tuple[str, ...]) -> tuple[bool, tuple[str, ...]]:
        """
        Look up a path of keys in the tree.
        Returns (success, path).
        If success is True the path reached the bottom of the tree and path is equal to keys.
        If success is False, path holds the prefix of keys that was found.
        """
        loc = self.tree
        for i, key in enumerate(keys):
            if key in loc:
                h = loc[key]  # get the hash of the subtree
                loc = self.cache[h]  # get the subtree
            else:
                return False, keys[:i]
        return True, keys

    def keys(self, keys: tuple[str, ...] = ()) -> list[str] | None:
        loc = self.tree
        for i, key in enumerate(keys):
            if key in loc:
                h = loc[key]  # get the hash of the subtree
                loc = self.cache[h]  # get the subtree
            else:
                return None
        return list(loc.keys())

    def multi_match(self, request: dict[str, list[str]], loc=None):
        # Check for the default argument before the emptiness check, otherwise the
        # top-level call would return immediately.
        if loc is None:
            loc = self.tree
        if not loc:
            return {"_END_": {}}
        matches = {}
        for request_key, request_values in request.items():
            for request_value in request_values:
                meta_key = f"{request_key}={request_value}"
                if meta_key in loc:
                    new_loc = self.cache[loc[meta_key]]
                    matches[meta_key] = self.multi_match(request, new_loc)

        if not matches:
            return {k: {} for k in loc.keys()}
        return matches

    def _insert(self, old_h: int, tree: RefcountedDict, keys: tuple[str, ...]) -> int:
        "Insert keys in the subtree and return the new hash of the subtree"
        key, *rest = keys
        assert old_h in self.cache

        # Adding a new branch to the tree
        if key not in tree:
            new_tree = RefcountedDict(tree | {key: self._cache_path(rest)})

        else:
            # Make a copy of the tree and update the subtree
            new_tree = RefcountedDict(tree.copy())
            subtree_h = tree[key]
            subtree = self.cache[subtree_h]
            new_tree[key] = self._insert(subtree_h, subtree, tuple(rest))

        # no-op if the hash hasn't changed
        new_h = self._replace_in_cache(old_h, new_tree)
        return new_h

    def insert(self, keys: tuple[str, ...]):
        """
        Insert a new branch into the compressed tree
        """
        already_there, path = self.lookup(keys)
        if already_there:
            return
        # Update the tree
        self.root_hash = self._insert(self.root_hash, self.tree, keys)
        self.tree = self.cache[self.root_hash]

    def insert_tree(self, subtree: Tree):
        """
        Insert a whole tree into the compressed tree.
        """
        self.root_hash = self._insert_tree(self.root_hash, self.tree, subtree)
        self.tree = self.cache[self.root_hash]

    def _insert_tree(self, old_h: int, tree: RefcountedDict, subtree: Tree) -> int:
        """
        Recursively insert a subtree into the compressed tree and return the new hash.
        """
        assert old_h in self.cache

        # Make a copy of the tree to avoid modifying shared structures
        new_tree = RefcountedDict(tree.copy())
        for key, sub_subtree in subtree.items():
            if key not in tree:
                # Key is not in current tree, add the subtree
                # Cache the subtree rooted at sub_subtree
                subtree_h = self.cache_tree(sub_subtree)
                new_tree[key] = subtree_h
            else:
                # Key is in tree, need to recursively merge
                # Get the hash and subtree from the current tree
                child_h = tree[key]
                child_tree = self.cache[child_h]
                # Recursively merge
                new_child_h = self._insert_tree(child_h, child_tree, sub_subtree)
                new_tree[key] = new_child_h

        # Replace the old hash with the new one in the cache
        new_h = self._replace_in_cache(old_h, new_tree)
        return new_h

    def save(self, path: Path):
        "Save the compressed tree to a file"
        with open(path, "w") as f:
            json.dump({
                "cache": {k: {"refcount": v.refcount, "dict": v} for k, v in self.cache.items()},
                "root_hash": self.root_hash
            }, f)

    @classmethod
    def load(cls, path: Path) -> "CompressedTree":
        "Load the compressed tree from a file"
        with open(path) as f:
            data = json.load(f)
        return cls.from_json(data)

    @classmethod
    def from_json(cls, data: dict) -> "CompressedTree":
        c = CompressedTree({})
        c.cache = {}
        for k, v in data["cache"].items():
            c.cache[int(k)] = RefcountedDict(v["dict"])
            c.cache[int(k)].refcount = v["refcount"]

        c.root_hash = data["root_hash"]
        c.tree = c.cache[c.root_hash]
        return c

if __name__ == "__main__":
    original_tree = {
        "a": {
            "b1": {
                "c": {}
            },
            "b2": {
                "c": {}
            },
            "b3*": {
                "c*": {}
            }
        }
    }

    c_tree = CompressedTree(original_tree)

    assert c_tree.lookup(("a", "b1", "c")) == (True, ("a", "b1", "c"))
    assert c_tree.lookup(("a", "b1", "d")) == (False, ("a", "b1"))

    print(json.dumps(c_tree.reconstruct_compressed(), indent=4))

    assert c_tree.reconstruct() == original_tree

    c_tree.insert(("a", "b1", "d"))
    c_tree.insert(("a", "b2", "d"))
    print(json.dumps(c_tree.reconstruct(), indent=4))

    print(json.dumps(c_tree.reconstruct_compressed(), indent=4))
    print(c_tree.cache)

    # test round trip
    assert CompressedTree(original_tree).reconstruct() == original_tree

    # test adding a key
    added_keys_tree = {
        "a": {
            "b1": {
                "c": {}
            },
            "b2": {
                "c": {},
                "d": {}
            },
            "b3*": {
                "c*": {},
                "d*": {}
            }
        }
    }
    c_tree = CompressedTree(original_tree)
    c_tree.insert(("a", "b2", "d"))
    c_tree.insert(("a", "b3*", "d*"))
    assert c_tree.reconstruct() == added_keys_tree

    print(c_tree.reconstruct_compressed())
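
A quick usage sketch (added for illustration, not part of the commit) of the save()/load() JSON round trip above, assuming the package is installed so the module is importable as TreeTraverser.CompressedTree:

    from pathlib import Path
    from TreeTraverser.CompressedTree import CompressedTree

    tree = {"a": {"b1": {"c": {}}, "b2": {"c": {}}}}
    c_tree = CompressedTree(tree)
    c_tree.save(Path("tree.json"))             # dumps the refcounted cache and root hash as JSON
    restored = CompressedTree.load(Path("tree.json"))
    assert restored.reconstruct() == tree      # the nested dict survives the round trip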
tree_compresser/src/TreeTraverser/fdb_schema/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .fdb_schema_parser import FDBSchema, FDBSchemaFile, KeySpec, Key
tree_compresser/src/TreeTraverser/fdb_schema/fdb_schema_parser.py (new file, 375 lines)
@@ -0,0 +1,375 @@
import dataclasses
import json
from dataclasses import dataclass, field
from typing import Any

import pe
from pe.actions import Pack
from pe.operators import Class, Star

from .fdb_types import FDB_type_to_implementation, FDBType

@dataclass(frozen=True)
class KeySpec:
    """
    Represents the specification of a single key in an FDB schema file. For example, in
    ```
    [ class, expver, stream=lwda, date, time, domain?
        [ type=ofb/mfb/oai
            [ obsgroup, reportype ]]]
    ```
    class, expver, type=ofb/mfb/oai etc. are the KeySpecs.

    These can carry additional information such as flags like `domain?`, allowed values like `type=ofb/mfb/oai`,
    or type information like `date: ClimateMonthly`.
    """

    key: str
    type: FDBType = field(default_factory=FDBType)
    flag: str | None = None
    values: tuple = field(default_factory=tuple)
    comment: str = ""

    def __repr__(self):
        repr = self.key
        if self.flag:
            repr += self.flag
        # if self.type:
        #     repr += f":{self.type}"
        if self.values:
            repr += "=" + "/".join(self.values)
        return repr

    def matches(self, key, value):
        # Sanity check!
        if self.key != key:
            return False

        # Some keys have a set of allowed values type=ofb/mfb/oai
        if self.values:
            if value not in self.values:
                return False

        # Check the formatting of values like Time or Date
        if self.type and not self.type.validate(value):
            return False

        return True

    def is_optional(self):
        if self.flag is None:
            return False
        return "?" in self.flag

    def is_allable(self):
        if self.flag is None:
            return False
        return "*" in self.flag

@dataclass(frozen=True)
class Comment:
    "Represents a comment node in the schema"

    value: str


@dataclass(frozen=True)
class FDBSchemaTypeDef:
    "Mapping between FDB schema key names and FDB Schema Types, i.e. expver is of type Expver"

    key: str
    type: str

# This is the schema grammar written in PEG format
fdb_schema = pe.compile(
    r"""
    FDB < Line+ EOF
    Line < Schema / Comment / TypeDef / empty

    # Comments
    Comment <- "#" ~non_eol*
    non_eol <- [\x09\x20-\x7F] / non_ascii
    non_ascii <- [\x80-\uD7FF\uE000-\U0010FFFF]

    # Default Type Definitions
    TypeDef < String ":" String ";"

    # Schemas are the main attraction
    # They're a tree of KeySpecs.
    Schema < "[" KeySpecs (","? Schema)* "]"

    # KeySpecs can be just a name i.e expver
    # Can also have a type expver:int
    # Or a flag expver?
    # Or values expver=xxx
    KeySpecs < KeySpec_ws ("," KeySpec_ws)*
    KeySpec_ws < KeySpec
    KeySpec <- key:String (flag:Flag)? (type:Type)? (values:Values)? ([ ]* comment:Comment)?
    Flag <- ~("?" / "-" / "*")
    Type <- ":" [ ]* String
    Values <- "=" Value ("/" Value)*

    # Low level stuff
    Value <- ~([-a-zA-Z0-9_]+)
    String <- ~([a-zA-Z0-9_]+)
    EOF <- !.
    empty <- ""
    """,
    actions={
        "Schema": Pack(tuple),
        "KeySpec": KeySpec,
        "Values": Pack(tuple),
        "Comment": Comment,
        "TypeDef": FDBSchemaTypeDef,
    },
    ignore=Star(Class("\t\f\r\n ")),
    # flags=pe.DEBUG,
)
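
# Illustrative note (added, not part of the committed file): the grammar above accepts
# optional "key : Type ;" typedef lines followed by nested bracketed schemas, e.g.
#
#   expver: Expver;
#   [ class, expver, stream=lwda, date, time, domain?
#       [ type=ofb/mfb/oai
#           [ obsgroup, reportype ]]]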


def post_process(entries):
    "Take the raw output from the PEG parser and split it into type definitions and schema entries."
    typedefs = {}
    schemas = []
    for entry in entries:
        match entry:
            case c if isinstance(c, Comment):
                pass
            case t if isinstance(t, FDBSchemaTypeDef):
                typedefs[t.key] = t.type
            case s if isinstance(s, tuple):
                schemas.append(s)
            case _:
                raise ValueError
    return typedefs, tuple(schemas)


def determine_types(types, node):
    "Recursively walk a schema tree and insert the type information."
    if isinstance(node, tuple):
        return [determine_types(types, n) for n in node]
    return dataclasses.replace(node, type=types.get(node.key, FDBType()))

@dataclass
class Key:
    key: str
    value: Any
    key_spec: KeySpec
    reason: str

    def str_value(self):
        return self.key_spec.type.format(self.value)

    def __bool__(self):
        return self.reason in {"Matches", "Skipped", "Select All"}

    def emoji(self):
        return {"Matches": "✅", "Skipped": "⏭️", "Select All": "★"}.get(
            self.reason, "❌"
        )

    def info(self):
        return f"{self.emoji()} {self.key:<12}= {str(self.value):<12} ({self.key_spec}) {self.reason if not self else ''}"

    def __repr__(self):
        return f"{self.key}={self.key_spec.type.format(self.value)}"

    def as_json(self):
        return dict(
            key=self.key,
            value=self.str_value(),
            reason=self.reason,
        )

class FDBSchema:
    """
    Represents a parsed FDB Schema file.
    Has methods to validate request dictionaries and convert them to a MARS request form with type information.
    """

    def __init__(self, string, defaults: dict[str, str] = {}):
        """
        1. Use a PEG parser on a schema string,
        2. Separate the output into schemas and typedefs,
        3. Insert any concrete implementations of types from fdb_types.py, defaulting to the generic string type,
        4. Walk the schema tree and annotate it with type information.
        """
        m = fdb_schema.match(string)
        g = list(m.groups())
        self._str_types, schemas = post_process(g)
        self.types = {
            key: FDB_type_to_implementation[type]
            for key, type in self._str_types.items()
        }
        self.schemas = determine_types(self.types, schemas)
        self.defaults = defaults

    def __repr__(self):
        return json.dumps(
            dict(schemas=self.schemas, defaults=self.defaults), indent=4, default=repr
        )

    @classmethod
    def consume_key(
        cls, key_spec: KeySpec, request: dict[str, Any]
    ) -> Key:
        key = key_spec.key
        try:
            value = request[key]
        except KeyError:
            if key_spec.is_optional():
                return Key(key_spec.key, "", key_spec, "Skipped")
            if key_spec.is_allable():
                return Key(key_spec.key, "", key_spec, "Select All")
            else:
                return Key(
                    key_spec.key, "", key_spec, "Key Missing"
                )

        if key_spec.matches(key, value):
            return Key(
                key_spec.key,
                key_spec.type.parse(value),
                key_spec,
                "Matches",
            )
        else:
            return Key(
                key_spec.key, value, key_spec, "Incorrect Value"
            )

    @classmethod
    def _DFS_match(
        cls, tree: list, request: dict[str, Any]
    ) -> tuple[bool | list, list[Key]]:
        """Do a DFS on the schema tree, returning the deepest matching path.
        At each stage return whether we matched on this path, and the path itself.

        When traversing the tree there are three cases to consider:
        1. base case []
        2. one schema [k, k, k, [k, k, k]]
        3. list of schemas [[k,k,k], [k,k,k], [k,k,k]]
        """
        # Case 1: Base Case
        if not tree:
            return True, []

        # Case 2: [k, k, k, [k, k, k]]
        if isinstance(tree[0], KeySpec):
            node, *tree = tree
            # Check if this node is in the request
            match_result = cls.consume_key(node, request)

            # If it isn't then terminate this path here
            if not match_result:
                return False, [match_result,]  # fmt: skip

            # Otherwise continue walking the tree and return the best result
            matched, path = cls._DFS_match(tree, request)

            # Don't put the key in the path if it's optional and we're skipping it.
            if match_result.reason != "Skipped":
                path = [match_result,] + path  # fmt: skip

            return matched, path

        # Case 3: [[k, k, k], [k, k, k]]
        branches = []
        for branch in tree:
            matched, branch_path = cls._DFS_match(branch, request)

            # If this branch matches, terminate the DFS and use this.
            if matched:
                return branch, branch_path
            else:
                branches.append(branch_path)

        # If no branch matches, return the one with the deepest match
        return False, max(branches, key=len)

    @classmethod
    def _DFS_match_all(
        cls, tree: list, request: dict[str, Any]
    ) -> list[list[Key]]:
        """Do a DFS on the schema tree, returning all matching paths or partial matches.
        At each stage return all matching paths and the deepest partial matches.

        When traversing the tree there are three cases to consider:
        1. base case []
        2. one schema [k, k, k, [k, k, k]]
        3. list of schemas [[k,k,k], [k,k,k], [k,k,k]]
        """
        # Case 1: Base Case
        if not tree:
            return [[]]

        # Case 2: [k, k, k, [k, k, k]]
        if isinstance(tree[0], KeySpec):
            node, *tree = tree
            # Check if this node is in the request
            request_values = request.get(node.key, None)

            if request_values is None:
                # If the key is not in the request, return a partial match with Key Missing
                return [[Key(node.key, "", node, "Key Missing")]]

            # If the request value is a list, try to match each value
            if isinstance(request_values, list):
                all_matches = []
                for value in request_values:
                    match_result = cls.consume_key(node, {node.key: value})

                    if match_result:
                        sub_matches = cls._DFS_match_all(tree, request)
                        for match in sub_matches:
                            if match_result.reason != "Skipped":
                                match.insert(0, match_result)
                            all_matches.append(match)

                return all_matches if all_matches else [[Key(node.key, "", node, "No Match Found")]]
            else:
                # Handle a single value
                match_result = cls.consume_key(node, request)

                # If it isn't then return a partial match with Key Missing
                if not match_result:
                    return [[Key(node.key, "", node, "Key Missing")]]

                # Continue walking the tree and get all matches
                all_matches = cls._DFS_match_all(tree, request)

                # Prepend the current match to all further matches
                for match in all_matches:
                    if match_result.reason != "Skipped":
                        match.insert(0, match_result)

                return all_matches

        # Case 3: [[k, k, k], [k, k, k]]
        all_branch_matches = []
        for branch in tree:
            branch_matches = cls._DFS_match_all(branch, request)
            all_branch_matches.extend(branch_matches)

        # Return all of the deepest partial matches or complete matches
        return all_branch_matches

    def match_all(self, request: dict[str, Any]):
        request = request | self.defaults
        return self._DFS_match_all(self.schemas, request)

    def match(self, request: dict[str, Any]):
        request = request | self.defaults
        return self._DFS_match(self.schemas, request)


class FDBSchemaFile(FDBSchema):
    def __init__(self, path: str):
        with open(path, "r") as f:
            super().__init__(f.read())
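
A small usage sketch (added for illustration, not part of the commit) of the parser above. The schema text and request values are made up; with the package installed the class is importable from TreeTraverser.fdb_schema:

    from TreeTraverser.fdb_schema import FDBSchema

    schema = FDBSchema("""
    [ class, expver, stream=lwda, date, time
        [ type=ofb/mfb [ obsgroup, reportype ]]]
    """)

    request = {"class": "od", "expver": "0001", "stream": "lwda",
               "date": "20240101", "time": "1200",
               "type": "ofb", "obsgroup": "conv", "reportype": "16001"}

    matched, path = schema.match(request)
    for key in path:
        print(key.info())   # one line per key with a ✅/⏭️/★/❌ marker and the reason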
tree_compresser/src/TreeTraverser/fdb_schema/fdb_types.py (new file, 83 lines)
@@ -0,0 +1,83 @@
from dataclasses import dataclass
from typing import Any
import re
from collections import defaultdict
from datetime import datetime, date, time


@dataclass(repr=False)
class FDBType:
    """
    Holds information about how to format and validate a given FDB Schema type like Time or Expver.
    This base type represents a string and does no validation or formatting. It's the default type.
    """

    name: str = "String"

    def __repr__(self) -> str:
        return self.name

    def validate(self, s: Any) -> bool:
        try:
            self.parse(s)
            return True
        except (ValueError, AssertionError):
            return False

    def format(self, s: Any) -> str:
        return str(s).lower()

    def parse(self, s: str) -> Any:
        return s

@dataclass(repr=False)
class Expver_FDBType(FDBType):
    name: str = "Expver"

    def parse(self, s: str) -> str:
        assert bool(re.match(".{4}", s))
        return s


@dataclass(repr=False)
class Time_FDBType(FDBType):
    name: str = "Time"
    time_format = "%H%M"

    def format(self, t: time) -> str:
        return t.strftime(self.time_format)

    def parse(self, s: datetime | str | int) -> time:
        if isinstance(s, str):
            assert len(s) == 4
            return datetime.strptime(s, self.time_format).time()
        if isinstance(s, datetime):
            return s.time()
        return self.parse(f"{s:04}")


@dataclass(repr=False)
class Date_FDBType(FDBType):
    name: str = "Date"
    date_format: str = "%Y%m%d"

    def format(self, d: Any) -> str:
        if isinstance(d, date):
            return d.strftime(self.date_format)
        if isinstance(d, int):
            return f"{d:08}"
        else:
            return d

    def parse(self, s: datetime | str | int) -> date:
        if isinstance(s, str):
            return datetime.strptime(s, self.date_format).date()
        elif isinstance(s, datetime):
            return s.date()
        return self.parse(f"{s:08}")


FDB_type_to_implementation = defaultdict(lambda: FDBType()) | {
    cls.name: cls() for cls in [Expver_FDBType, Time_FDBType, Date_FDBType]
}
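
A brief illustration (added, not part of the commit) of how the concrete types above parse and format values; Date and Time accept strings, datetimes or ints and format back to the FDB string form:

    from datetime import datetime
    from TreeTraverser.fdb_schema.fdb_types import Date_FDBType, Time_FDBType

    d, t = Date_FDBType(), Time_FDBType()
    assert d.parse("20240101") == datetime(2024, 1, 1).date()
    assert d.format(d.parse(20240101)) == "20240101"   # int -> date -> "YYYYMMDD"
    assert t.format(t.parse(12)) == "0012"             # int -> time -> "HHMM"
    assert t.validate("1200") and not t.validate("noon")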