diff --git a/.gitignore b/.gitignore index 0eaedbc..500fb17 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ target/ *.so _build/ build/ -.ipynb_checkpoints/ \ No newline at end of file +.ipynb_checkpoints/ +dist/ +Cargo.lock \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index 6577bb0..0000000 --- a/Cargo.lock +++ /dev/null @@ -1,223 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 4 - -[[package]] -name = "autocfg" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "indoc" -version = "2.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" - -[[package]] -name = "itoa" -version = "1.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" - -[[package]] -name = "libc" -version = "0.2.169" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" - -[[package]] -name = "memchr" -version = "2.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" - -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - -[[package]] -name = "once_cell" -version = "1.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" - -[[package]] -name = "portable-atomic" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" - -[[package]] -name = "proc-macro2" -version = "1.0.93" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "pyo3" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" -dependencies = [ - "cfg-if", - "indoc", - "libc", - "memoffset", - "once_cell", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" -dependencies = [ - "heck", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn", -] - -[[package]] -name = "qubed" -version = "0.1.2" -dependencies = [ - "pyo3", - "serde", - "serde_json", -] - -[[package]] -name = "quote" -version = "1.0.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "ryu" -version = "1.0.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" - -[[package]] -name = "serde" -version = "1.0.217" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.217" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.138" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", -] - -[[package]] -name = "syn" -version = "2.0.98" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "target-lexicon" -version = "0.12.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" - -[[package]] -name = "unicode-ident" -version = "1.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" - -[[package]] -name = "unindent" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" diff --git a/Cargo.toml b/Cargo.toml index d7e97c6..3531a47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "qubed" -version = "0.1.2" +# version = "0.1.5" edition = "2021" repository = "https://github.com/ecmwf/qubed" diff --git a/pyproject.toml b/pyproject.toml index 994471d..fcdd86e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["setuptools", "wheel"] -build-backend = "setuptools.build_meta" +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" [project] name = "qubed" diff --git a/src/python/qubed/__main__.py b/src/python/qubed/__main__.py index eb476b7..5667e84 100644 --- a/src/python/qubed/__main__.py +++ b/src/python/qubed/__main__.py @@ -56,11 +56,12 @@ def convert(args): new_branch = Qube.from_datacube(datacube) q = (q | Qube.from_datacube(datacube)) - output = match args.output_format: - case "text": - str(q) - case "html": - q.html() + # output = match args.output_format: + # case "text": + # str(q) + # case "html": + # q.html() + output = "fw" with open(args.output, "w") as f: f.write(output) diff --git a/src/python/qubed/fdb_schema/__init__.py b/src/python/qubed/fdb_schema/__init__.py deleted file mode 100644 index 56160a5..0000000 --- a/src/python/qubed/fdb_schema/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .fdb_schema_parser import FDBSchema, FDBSchemaFile, KeySpec, Key diff --git a/src/python/qubed/fdb_schema/fdb_schema_parser.py b/src/python/qubed/fdb_schema/fdb_schema_parser.py deleted file mode 100644 index 852c1bb..0000000 --- a/src/python/qubed/fdb_schema/fdb_schema_parser.py +++ /dev/null @@ -1,375 +0,0 @@ -import dataclasses -import json -from dataclasses import dataclass, field -from typing import Any - -import pe -from pe.actions import Pack -from pe.operators import Class, Star - -from .fdb_types import FDB_type_to_implementation, FDBType - - -@dataclass(frozen=True) -class KeySpec: - """ - Represents the specification of a single key in an FDB schema file. For example in - ``` - [ class, expver, stream=lwda, date, time, domain? - [ type=ofb/mfb/oai - [ obsgroup, reportype ]]] - ``` - class, expver, type=ofdb/mfb/oai etc are the KeySpecs - - These can have additional information such as: flags like `domain?`, allowed values like `type=ofb/mfb/oai` - or specify type information with `date: ClimateMonthly` - - """ - - key: str - type: FDBType = field(default_factory=FDBType) - flag: str | None = None - values: tuple = field(default_factory=tuple) - comment: str = "" - - def __repr__(self): - repr = self.key - if self.flag: - repr += self.flag - # if self.type: - # repr += f":{self.type}" - if self.values: - repr += "=" + "/".join(self.values) - return repr - - def matches(self, key, value): - # Sanity check! - if self.key != key: - return False - - # Some keys have a set of allowed values type=ofb/mfb/oai - if self.values: - if value not in self.values: - return False - - # Check the formatting of values like Time or Date - if self.type and not self.type.validate(value): - return False - - return True - - def is_optional(self): - if self.flag is None: - return False - return "?" in self.flag - - def is_allable(self): - if self.flag is None: - return False - return "*" in self.flag - - -@dataclass(frozen=True) -class Comment: - "Represents a comment node in the schema" - - value: str - - -@dataclass(frozen=True) -class FDBSchemaTypeDef: - "Mapping between FDB schema key names and FDB Schema Types, i.e expver is of type Expver" - - key: str - type: str - - -# This is the schema grammar written in PEG format -fdb_schema = pe.compile( - r""" - FDB < Line+ EOF - Line < Schema / Comment / TypeDef / empty - - # Comments - Comment <- "#" ~non_eol* - non_eol <- [\x09\x20-\x7F] / non_ascii - non_ascii <- [\x80-\uD7FF\uE000-\U0010FFFF] - - # Default Type Definitions - TypeDef < String ":" String ";" - - # Schemas are the main attraction - # They're a tree of KeySpecs. - Schema < "[" KeySpecs (","? Schema)* "]" - - # KeySpecs can be just a name i.e expver - # Can also have a type expver:int - # Or a flag expver? - # Or values expver=xxx - KeySpecs < KeySpec_ws ("," KeySpec_ws)* - KeySpec_ws < KeySpec - KeySpec <- key:String (flag:Flag)? (type:Type)? (values:Values)? ([ ]* comment:Comment)? - Flag <- ~("?" / "-" / "*") - Type <- ":" [ ]* String - Values <- "=" Value ("/" Value)* - - # Low level stuff - Value <- ~([-a-zA-Z0-9_]+) - String <- ~([a-zA-Z0-9_]+) - EOF <- !. - empty <- "" - """, - actions={ - "Schema": Pack(tuple), - "KeySpec": KeySpec, - "Values": Pack(tuple), - "Comment": Comment, - "TypeDef": FDBSchemaTypeDef, - }, - ignore=Star(Class("\t\f\r\n ")), - # flags=pe.DEBUG, -) - - -def post_process(entries): - "Take the raw output from the PEG parser and split it into type definitions and schema entries." - typedefs = {} - schemas = [] - for entry in entries: - match entry: - case c if isinstance(c, Comment): - pass - case t if isinstance(t, FDBSchemaTypeDef): - typedefs[t.key] = t.type - case s if isinstance(s, tuple): - schemas.append(s) - case _: - raise ValueError - return typedefs, tuple(schemas) - - -def determine_types(types, node): - "Recursively walk a schema tree and insert the type information." - if isinstance(node, tuple): - return [determine_types(types, n) for n in node] - return dataclasses.replace(node, type=types.get(node.key, FDBType())) - - -@dataclass -class Key: - key: str - value: Any - key_spec: KeySpec - reason: str - - def str_value(self): - return self.key_spec.type.format(self.value) - - def __bool__(self): - return self.reason in {"Matches", "Skipped", "Select All"} - - def emoji(self): - return {"Matches": "✅", "Skipped": "⏭️", "Select All": "★"}.get( - self.reason, "❌" - ) - - def info(self): - return f"{self.emoji()} {self.key:<12}= {str(self.value):<12} ({self.key_spec}) {self.reason if not self else ''}" - - def __repr__(self): - return f"{self.key}={self.key_spec.type.format(self.value)}" - - def as_json(self): - return dict( - key=self.key, - value=self.str_value(), - reason=self.reason, - ) - - -class FDBSchema: - """ - Represents a parsed FDB Schema file. - Has methods to validate and convert request dictionaries to a mars request form with validation and type information. - """ - - def __init__(self, string, defaults: dict[str, str] = {}): - """ - 1. Use a PEG parser on a schema string, - 2. Separate the output into schemas and typedefs - 3. Insert any concrete implementations of types from fdb_types.py defaulting to generic string type - 4. Walk the schema tree and annotate it with type information. - """ - m = fdb_schema.match(string) - g = list(m.groups()) - self._str_types, schemas = post_process(g) - self.types = { - key: FDB_type_to_implementation[type] - for key, type in self._str_types.items() - } - self.schemas = determine_types(self.types, schemas) - self.defaults = defaults - - def __repr__(self): - return json.dumps( - dict(schemas=self.schemas, defaults=self.defaults), indent=4, default=repr - ) - - @classmethod - def consume_key( - cls, key_spec: KeySpec, request: dict[str, Any] - ) -> Key: - key = key_spec.key - try: - value = request[key] - except KeyError: - if key_spec.is_optional(): - return Key(key_spec.key, "", key_spec, "Skipped") - if key_spec.is_allable(): - return Key(key_spec.key, "", key_spec, "Select All") - else: - return Key( - key_spec.key, "", key_spec, "Key Missing" - ) - - if key_spec.matches(key, value): - return Key( - key_spec.key, - key_spec.type.parse(value), - key_spec, - "Matches", - ) - else: - return Key( - key_spec.key, value, key_spec, "Incorrect Value" - ) - - @classmethod - def _DFS_match( - cls, tree: list, request: dict[str, Any] - ) -> tuple[bool | list, list[Key]]: - """Do a DFS on the schema tree, returning the deepest matching path - At each stage return whether we matched on this path, and the path itself. - - When traversing the tree there are three cases to consider: - 1. base case [] - 2. one schema [k, k, k, [k, k, k]] - 3. list of schemas [[k,k,k], [k,k,k], [k,k,k]] - """ - # Case 1: Base Case - if not tree: - return True, [] - - # Case 2: [k, k, k, [k, k, k]] - if isinstance(tree[0], KeySpec): - node, *tree = tree - # Check if this node is in the request - match_result = cls.consume_key(node, request) - - # If if isn't then terminate this path here - if not match_result: - return False, [match_result,] # fmt: skip - - # Otherwise continue walking the tree and return the best result - matched, path = cls._DFS_match(tree, request) - - # Don't put the key in the path if it's optional and we're skipping it. - if match_result.reason != "Skipped": - path = [match_result,] + path # fmt: skip - - return matched, path - - # Case 3: [[k, k, k], [k, k, k]] - branches = [] - for branch in tree: - matched, branch_path = cls._DFS_match(branch, request) - - # If this branch matches, terminate the DFS and use this. - if matched: - return branch, branch_path - else: - branches.append(branch_path) - - # If no branch matches, return the one with the deepest match - return False, max(branches, key=len) - - @classmethod - def _DFS_match_all( - cls, tree: list, request: dict[str, Any] - ) -> list[list[Key]]: - """Do a DFS on the schema tree, returning all matching paths or partial matches. - At each stage return all matching paths and the deepest partial matches. - - When traversing the tree there are three cases to consider: - 1. base case [] - 2. one schema [k, k, k, [k, k, k]] - 3. list of schemas [[k,k,k], [k,k,k], [k,k,k]] - """ - # Case 1: Base Case - if not tree: - return [[]] - - # Case 2: [k, k, k, [k, k, k]] - if isinstance(tree[0], KeySpec): - node, *tree = tree - # Check if this node is in the request - request_values = request.get(node.key, None) - - if request_values is None: - # If the key is not in the request, return a partial match with Key Missing - return [[Key(node.key, "", node, "Key Missing")]] - - # If the request value is a list, try to match each value - if isinstance(request_values, list): - all_matches = [] - for value in request_values: - match_result = cls.consume_key(node, {node.key: value}) - - if match_result: - sub_matches = cls._DFS_match_all(tree, request) - for match in sub_matches: - if match_result.reason != "Skipped": - match.insert(0, match_result) - all_matches.append(match) - - return all_matches if all_matches else [[Key(node.key, "", node, "No Match Found")]] - else: - # Handle a single value - match_result = cls.consume_key(node, request) - - # If it isn't then return a partial match with Key Missing - if not match_result: - return [[Key(node.key, "", node, "Key Missing")]] - - # Continue walking the tree and get all matches - all_matches = cls._DFS_match_all(tree, request) - - # Prepend the current match to all further matches - for match in all_matches: - if match_result.reason != "Skipped": - match.insert(0, match_result) - - return all_matches - - # Case 3: [[k, k, k], [k, k, k]] - all_branch_matches = [] - for branch in tree: - branch_matches = cls._DFS_match_all(branch, request) - all_branch_matches.extend(branch_matches) - - # Return all of the deepest partial matches or complete matches - return all_branch_matches - - def match_all(self, request: dict[str, Any]): - request = request | self.defaults - return self._DFS_match_all(self.schemas, request) - - def match(self, request: dict[str, Any]): - request = request | self.defaults - return self._DFS_match(self.schemas, request) - - -class FDBSchemaFile(FDBSchema): - def __init__(self, path: str): - with open(path, "r") as f: - return super().__init__(f.read()) diff --git a/src/python/qubed/fdb_schema/fdb_types.py b/src/python/qubed/fdb_schema/fdb_types.py deleted file mode 100644 index 05093db..0000000 --- a/src/python/qubed/fdb_schema/fdb_types.py +++ /dev/null @@ -1,83 +0,0 @@ -from dataclasses import dataclass -from typing import Any -import re -from collections import defaultdict -from datetime import datetime, date, time - - -@dataclass(repr=False) -class FDBType: - """ - Holds information about how to format and validate a given FDB Schema type like Time or Expver - This base type represents a string and does no validation or formatting. It's the default type. - """ - - name: str = "String" - - def __repr__(self) -> str: - return self.name - - def validate(self, s: Any) -> bool: - try: - self.parse(s) - return True - except (ValueError, AssertionError): - return False - - def format(self, s: Any) -> str: - return str(s).lower() - - def parse(self, s: str) -> Any: - return s - - -@dataclass(repr=False) -class Expver_FDBType(FDBType): - name: str = "Expver" - - def parse(self, s: str) -> str: - assert bool(re.match(".{4}", s)) - return s - - -@dataclass(repr=False) -class Time_FDBType(FDBType): - name: str = "Time" - time_format = "%H%M" - - def format(self, t: time) -> str: - return t.strftime(self.time_format) - - def parse(self, s: datetime | str | int) -> time: - if isinstance(s, str): - assert len(s) == 4 - return datetime.strptime(s, self.time_format).time() - if isinstance(s, datetime): - return s.time() - return self.parse(f"{s:04}") - - -@dataclass(repr=False) -class Date_FDBType(FDBType): - name: str = "Date" - date_format: str = "%Y%m%d" - - def format(self, d: Any) -> str: - if isinstance(d, date): - return d.strftime(self.date_format) - if isinstance(d, int): - return f"{d:08}" - else: - return d - - def parse(self, s: datetime | str | int) -> date: - if isinstance(s, str): - return datetime.strptime(s, self.date_format).date() - elif isinstance(s, datetime): - return s.date() - return self.parse(f"{s:08}") - - -FDB_type_to_implementation = defaultdict(lambda: FDBType()) | { - cls.name: cls() for cls in [Expver_FDBType, Time_FDBType, Date_FDBType] -}