From aaafa28dfb4397ec2261b4716a69129c2b6fe7a0 Mon Sep 17 00:00:00 2001 From: Tom Date: Thu, 29 May 2025 17:09:17 +0200 Subject: [PATCH] A bit more on the rust backend --- src/python/qubed/Qube.py | 12 ++--- src/python/qubed/value_types.py | 47 +++++++++++----- src/rust/lib.rs | 69 +++++++++++++++++++++--- src/rust/python_interface.rs | 96 +++++++++++++++++++++++++++++---- src/rust/serialisation/json.rs | 24 +++++++-- src/rust/set_operations.rs | 40 ++++++++++++++ test_scripts/rust.py | 92 +++++++++++++++++++++++++------ tests/test_iteration.py | 26 +++++++++ tests/test_rust.py | 30 +++++------ 9 files changed, 363 insertions(+), 73 deletions(-) create mode 100644 src/rust/set_operations.rs diff --git a/src/python/qubed/Qube.py b/src/python/qubed/Qube.py index b1bdfa1..1994840 100644 --- a/src/python/qubed/Qube.py +++ b/src/python/qubed/Qube.py @@ -61,7 +61,6 @@ class QubeNamedRoot: "Helper class to print a custom root name" key: str - dtype: str = "str" children: tuple[Qube, ...] = () def summary(self) -> str: @@ -388,12 +387,13 @@ class Qube: for c in node.children: yield from to_list_of_cubes(c) - if not node.children: - yield {node.key: list(node.values)} + else: + if not node.children: + yield {node.key: list(node.values)} - for c in node.children: - for sub_cube in to_list_of_cubes(c): - yield {node.key: list(node.values)} | sub_cube + for c in node.children: + for sub_cube in to_list_of_cubes(c): + yield {node.key: list(node.values)} | sub_cube return to_list_of_cubes(self) diff --git a/src/python/qubed/value_types.py b/src/python/qubed/value_types.py index e72f593..af7a5f6 100644 --- a/src/python/qubed/value_types.py +++ b/src/python/qubed/value_types.py @@ -65,6 +65,20 @@ class ValueGroup(ABC): T = TypeVar("T") EnumValuesType = FrozenSet[T] +_dtype_map: dict[str, type] = { + "str": str, + "int64": int, + "float64": float, + "date": datetime, +} +_dtype_map_inv: dict[type, str] = {v: k for k, v in _dtype_map.items()} +_dtype_formatters = { + "str": str, + "int64": int, + "float64": float, + "date": datetime.fromisoformat, +} + @dataclass(frozen=True, order=True) class QEnum(ValueGroup): @@ -76,10 +90,12 @@ class QEnum(ValueGroup): values: EnumValuesType _dtype: str = "str" - def __init__(self, obj): + def __init__(self, obj, dtype="str"): object.__setattr__(self, "values", tuple(sorted(obj))) object.__setattr__( - self, "dtype", type(self.values[0]) if len(self.values) > 0 else "str" + self, + "_dtype", + dtype, ) def __post_init__(self): @@ -108,7 +124,18 @@ class QEnum(ValueGroup): return min(self.values) def to_json(self): - return list(self.values) + return {"type": "enum", "dtype": self.dtype(), "values": self.values} + + # @classmethod + # def from_json(cls, type: Literal["enum"], dtype: str, values: list): + # dtype_formatter = _dtype_formatters[dtype] + + @classmethod + def from_list(cls, obj): + example = obj[0] + dtype = type(example) + assert [type(v) is dtype for v in obj] + return cls(obj, dtype=_dtype_map_inv[dtype]) @dataclass(frozen=True, order=True) @@ -389,17 +416,13 @@ class IntRange(Range): return ranges -def values_from_json(obj) -> ValueGroup: +def values_from_json(obj: dict | list) -> ValueGroup: if isinstance(obj, list): - return QEnum(tuple(obj)) + return QEnum.from_list(obj) - match obj["dtype"]: - case "date": - return DateRange(**obj) - case "time": - return TimeRange(**obj) - case "int": - return IntRange(**obj) + match obj["type"]: + case "enum": + QEnum.from_json(**obj) case _: raise ValueError(f"Unknown dtype {obj['dtype']}") diff --git a/src/rust/lib.rs b/src/rust/lib.rs index e09b95d..2411e30 100644 --- a/src/rust/lib.rs +++ b/src/rust/lib.rs @@ -3,10 +3,12 @@ use pyo3::prelude::*; use pyo3::wrap_pyfunction; use pyo3::types::{PyDict, PyInt, PyList, PyString}; +use python_interface::QubeError; use std::collections::HashMap; +use std::iter; use pyo3::prelude::*; use std::hash::Hash; - +use std::rc::Rc; use lasso::{Rodeo, Spur}; use std::num::NonZero; @@ -15,6 +17,7 @@ use std::ops; mod serialisation; mod python_interface; mod formatters; +mod set_operations; // This data structure uses the Newtype Index Pattern // See https://matklad.github.io/2018/06/04/newtype-index-pattern.html @@ -51,9 +54,6 @@ impl ops::Index for Qube { } impl NodeId { - pub fn new_infallible(value: NonZero) -> NodeId { - NodeId(value) - } pub fn new(value: usize) -> Option { NonZero::new(value).map(NodeId) } @@ -70,7 +70,7 @@ impl ops::Index for lasso::Rodeo { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct Node { pub key: StringId, pub metadata: HashMap>, @@ -115,9 +115,14 @@ impl Node { .map(|v| v.len()) .sum() } + + fn keys<'a>(&'a self, q: &'a Qube) -> impl Iterator { + self.children.keys() + .map(|s| {&q[*s]}) + } } -#[derive(Debug)] +#[derive(Debug, Clone)] #[pyclass(subclass, dict)] pub struct Qube { pub root: NodeId, @@ -142,7 +147,7 @@ impl Qube { StringId(self.strings.get_or_intern(val)) } - pub fn add_node(&mut self, parent: NodeId, key: &str, values: &[&str]) -> NodeId { + pub(crate) fn add_node(&mut self, parent: NodeId, key: &str, values: &[&str]) -> NodeId { let key_id = self.get_or_intern(key); let values = values.iter().map(|val| self.get_or_intern(val)).collect(); @@ -172,11 +177,59 @@ impl Qube { let node = &self[node_id]; node.summary(&self) } + + fn get_node_ref(&self, id: NodeId) -> NodeRef { + let node = &self[id]; + NodeRef { id: id, node: &node, qube: &self } + } + + pub fn get_string_id(&self, s: &str) -> Option { + self.strings.get(s) + .map(|id| StringId(id)) + } } #[pymodule] -fn rust(m: &Bound<'_, PyModule>) -> PyResult<()> { +fn rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; + m.add("QubeError", py.get_type::())?; Ok(()) } + + +pub struct NodeRef<'a> { + pub id: NodeId, + pub node: &'a Node, + pub qube: &'a Qube, +} + +impl<'a> NodeRef<'a> { + pub fn keys(&self) -> impl Iterator { + self.node.keys(self.qube) + } + + fn flat_children(&'a self) -> impl Iterator { + self.node.children + .values() + .flatten() + .map(|id| { + NodeRef { id: *id, node: &self.qube[*id], qube: self.qube } + }) + } + + fn children_by_key(&'a self, key: &str) -> impl Iterator { + let id = self.qube.get_string_id(key); + let children = id + .map(|i| self.node.children.get(&i)) + .flatten(); + + children.map( + |ids| ids.into_iter().map( + |id| { + NodeRef { id: *id, node: &self.qube[*id], qube: self.qube } + })).into_iter().flatten() + } + + +} diff --git a/src/rust/python_interface.rs b/src/rust/python_interface.rs index 53a7c46..fc6519f 100644 --- a/src/rust/python_interface.rs +++ b/src/rust/python_interface.rs @@ -1,19 +1,34 @@ -use crate::{Node, NodeId, Qube}; +use crate::{Node, NodeId, Qube, NodeRef}; use pyo3::prelude::*; -use pyo3::types::PyList; +use pyo3::types::{PyList, PyType}; +use core::borrow; use std::ops::Deref; +use std::cell::Ref; +use crate::set_operations; use crate::serialisation; +use itertools::Itertools; + +use pyo3::create_exception; + +create_exception!(qubed, QubeError, pyo3::exceptions::PyException); /// A reference to a particular node in a Qube #[pyclass] -pub struct NodeRef { +pub struct PyNodeRef { id: NodeId, qube: Py, // see https://pyo3.rs/v0.23.1/types for a discussion of Py and Bound<'py, T> } +fn into_py_node_ref(node_ref: NodeRef, qube: Py) -> PyNodeRef { + PyNodeRef { + id: node_ref.id, + qube: qube, + } +} + #[pymethods] -impl NodeRef { +impl PyNodeRef { fn __repr__(&self, py: Python) -> PyResult { // Get the Py reference, bind it to the GIL. let qube = self.qube.bind(py); @@ -43,13 +58,13 @@ impl NodeRef { } #[getter] - pub fn get_children(&self, py: Python) -> Vec { + pub fn get_children(&self, py: Python) -> Vec { let qube = self.qube.bind(py).borrow(); let node = &qube[self.id]; node.children .values() .flatten() - .map(|child_id| NodeRef { + .map(|child_id| Self { id: *child_id, qube: self.qube.clone_ref(py), }) @@ -57,6 +72,21 @@ impl NodeRef { } } +#[derive(FromPyObject)] +pub enum OneOrMany { + One(T), + Many(Vec), +} + +impl Into> for OneOrMany { + fn into(self) -> Vec { + match self { + OneOrMany::One(v) => vec![v], + OneOrMany::Many(vs) => vs, + } + } +} + #[pymethods] impl Qube { #[new] @@ -64,16 +94,56 @@ impl Qube { Qube::new() } + #[pyo3(name = "add_node")] + pub fn py_add_node( + slf: Bound<'_, Self>, + parent: PyRef<'_, PyNodeRef>, + key: &str, + values: OneOrMany, + ) -> PyResult { + // Check that the given parent is actually in this qube and not another one + if !parent.qube.bind(slf.py()).is(&slf) { + return Err(QubeError::new_err("Supplied parent node is not in the target qube.")) + } + + // massage values from T | Vec into Vec + let values: Vec = values.into(); + let values_refs: Vec<&str> = values.iter().map(String::as_str).collect(); + + let mut q = slf.borrow_mut(); + let node_id = q.add_node(parent.id, key, &values_refs); + Ok(PyNodeRef { id: node_id, qube: slf.into()}) + } + + pub fn set_root( + slf: Bound<'_, Self>, + node: PyRef<'_, PyNodeRef>, + ) -> () { + let mut q = slf.borrow_mut(); + q.root = node.id; + } + #[getter] - fn get_root(slf: Bound<'_, Self>) -> PyResult { - Ok(NodeRef { + fn get_root(slf: Bound<'_, Self>) -> PyResult { + Ok(PyNodeRef { id: slf.borrow().root, qube: slf.unbind(), }) } fn __repr__(&self) -> String { - self.string_tree() + // format!("{:?}", self) + let nodes_str: String = self.nodes.iter() + .enumerate() + .map(|(id, node)| { + format!("{{id: {}, key: {}, values: [{}], children: [{}]}}", + id+1, + &self[node.key], + node.values.iter().map(|s| &self[*s]).join(", "), + node.children().map(|n| n.0).join(", "), + ) + }).join(", "); + format!("Qube {{root: {}, nodes: {}}}", self.root.0, nodes_str) } fn __str__<'py>(&self) -> String { @@ -90,8 +160,8 @@ impl Qube { } #[getter] - pub fn get_children(slf: Bound<'_, Self>, py: Python) -> PyResult> { - let root = NodeRef { + pub fn get_children(slf: Bound<'_, Self>, py: Python) -> PyResult> { + let root = PyNodeRef { id: slf.borrow().root, qube: slf.unbind(), }; @@ -102,4 +172,8 @@ impl Qube { pub fn from_json(data: &str) -> Result { serialisation::from_json(data) } + + pub fn __or__(slf: Bound<'_, Self>, other: Bound<'_, Qube>) -> Qube { + set_operations::set_operation(&slf.borrow(), &other.borrow(), set_operations::Op::Union) + } } diff --git a/src/rust/serialisation/json.rs b/src/rust/serialisation/json.rs index bb4c94e..86d6796 100644 --- a/src/rust/serialisation/json.rs +++ b/src/rust/serialisation/json.rs @@ -24,10 +24,23 @@ impl From for JSONError { } #[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] +#[serde(tag = "dtype")] +enum Ranges { + Int64{values: Vec<(i64, i64)>} +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(tag = "dtype", rename_all = "lowercase")] +enum Enum { + Str{values: Vec} +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(tag = "type", rename_all = "lowercase")] enum Values { - Wildcard(String), - Enum(Vec), + Wildcard{}, + Enum(Enum), + Range(Ranges) } #[derive(Serialize, Deserialize, Debug)] @@ -43,8 +56,9 @@ fn add_nodes(qube: &mut Qube, parent: NodeId, nodes: &[JSONQube]) -> Vec .iter() .map(|json_node| { let values = match &json_node.values { - Values::Wildcard(_) => &vec!["*"], - Values::Enum(strings) => &strings.iter().map(|s| s.as_str()).collect(), + Values::Wildcard{} => &vec!["*"], + Values::Enum(Enum::Str{values}) => &values.iter().map(|s| s.as_str()).collect(), + Values::Range(_) => todo!(), }; let node_id = qube.add_node(parent, &json_node.key, values); diff --git a/src/rust/set_operations.rs b/src/rust/set_operations.rs new file mode 100644 index 0000000..0271677 --- /dev/null +++ b/src/rust/set_operations.rs @@ -0,0 +1,40 @@ +use crate::NodeRef; +use crate::{Node, NodeId, Qube}; +use itertools::chain; +use std::collections::HashSet; + +pub enum Op { + Union, + Intersection, + Difference, + SymmetricDifference, +} + +fn op_to_venn_diagram(op: Op) -> (bool, bool, bool) { + use Op::*; + match op { + Union => (true, true, true), + Intersection => (false, true, false), + Difference => (true, false, false), + SymmetricDifference => (true, false, true), + } +} + +pub fn set_operation<'a>(a: &'a Qube, b: &'a Qube, op: Op) -> Qube { + todo!() + // _set_operation(a.root_ref(), a.root_ref(), op) +} + +// fn _set_operation<'a>(a: NodeRef, b: NodeRef, op: Op) -> Qube { +// let keys: HashSet<&str> = HashSet::from_iter(chain(a.keys(), b.keys())); + +// for key in keys { +// let a = a.children_by_key(key) +// } + +// todo!() +// } + +pub fn set_operation_inplace<'a>(a: &'a mut Qube, b: &'a Qube, op: Op) -> &'a Qube { + a +} diff --git a/test_scripts/rust.py b/test_scripts/rust.py index 5d84567..ca418bd 100644 --- a/test_scripts/rust.py +++ b/test_scripts/rust.py @@ -1,21 +1,81 @@ from __future__ import annotations -import json +from datetime import datetime +from typing import Sequence -from qubed import Qube as pyQube -from qubed.rust import Qube as Qube +from qubed.rust import Qube as rsQube -q = pyQube.from_tree(""" -root, class=d1 -├── dataset=another-value, generation=1/2/3 -└── dataset=climate-dt/weather-dt, generation=1/2/3/4 -""") -json_str = json.dumps(q.to_json()) -rust_qube = Qube.from_json(json_str) -print(repr(rust_qube)) +# q = pyQube.from_tree(""" +# root, class=d1 +# ├── dataset=another-value, generation=1/2/3 +# └── dataset=climate-dt/weather-dt, generation=1/2/3/4 +# """) +# json_str = json.dumps(q.to_json()) +# rust_qube = Qube.from_json(json_str) +# # print(repr(rust_qube)) -expected = """root, class=d1 -├── dataset=another-value, generation=1/2/3 -└── dataset=climate-dt/weather-dt, generation=1/2/3/4""" -assert repr(rust_qube) == expected -# print(rs_qube._repr_html_()) +# # print(json_str) + +# expected = """root, class=d1 +# ├── dataset=another-value, generation=1/2/3 +# └── dataset=climate-dt/weather-dt, generation=1/2/3/4 +# """ +# assert repr(rust_qube) == expected +# # print(rs_qube._repr_html_()) + +# print(q | q) + +value = str | int | float | datetime + + +class Qube(rsQube): + @classmethod + def empty(cls): + q = cls() + print(f"empty called {cls = } {q = }") + return q + + @classmethod + def from_datacube(cls, datacube: dict[str, value | Sequence[value]]) -> Qube: + qube = cls.empty() + (key, values), *key_vals = list(datacube.items()) + node = qube.add_node(qube.root, key, values) + for key, values in key_vals: + node = qube.add_node(parent=node, key=key, values=values) + + return qube + + @classmethod + def from_dict(cls, d: dict) -> Qube: + q = cls.empty() + + def from_dict(parent, d: dict): + for k, children in d.items(): + key, values = k.split("=") + values = values.split("/") + + node = q.add_node( + parent=parent, + key=key, + values=values, + ) + from_dict(parent=node, d=children) + + from_dict(q.root, d) + return q + + +q = Qube.from_datacube({"a": ["4"], "b": "test", "c": ["1", "2", "3"]}) + +print(q) +print(repr(q)) + +q = Qube.from_dict( + { + "a=2/3": {"b=1": {}}, + "a2=a/b": {"b2=1/2": {}}, + } +) + +print(q) +print(repr(q)) diff --git a/tests/test_iteration.py b/tests/test_iteration.py index 10d68f6..0e9ad03 100644 --- a/tests/test_iteration.py +++ b/tests/test_iteration.py @@ -16,3 +16,29 @@ def test_iter_leaves_simple(): ] assert set(make_hashable(q.leaves())) == set(make_hashable(entries)) + + +def test_datacubes(): + q = Qube.from_tree(""" + root, class=d1 + ├── date=19920101/19930101/19940101, params=1/2/3 + └── date=19950101 + ├── level=1/2/3, params=1/2/3/4 + └── params=1/2/3/4 + """) + assert len(list(q.datacubes())) == 3 + + assert list(q.datacubes()) == [ + { + "class": ["d1"], + "date": ["19920101", "19930101", "19940101"], + "params": ["1", "2", "3"], + }, + { + "class": ["d1"], + "date": ["19950101"], + "level": ["1", "2", "3"], + "params": ["1", "2", "3", "4"], + }, + {"class": ["d1"], "date": ["19950101"], "params": ["1", "2", "3", "4"]}, + ] diff --git a/tests/test_rust.py b/tests/test_rust.py index 6f5fc5a..65a8513 100644 --- a/tests/test_rust.py +++ b/tests/test_rust.py @@ -1,21 +1,21 @@ from __future__ import annotations -import json -from qubed import Qube as pyQube from qubed.rust import Qube as Qube -q = pyQube.from_tree(""" -root, class=d1 -├── dataset=another-value, generation=1/2/3 -└── dataset=climate-dt/weather-dt, generation=1/2/3/4 -""") -json_str = json.dumps(q.to_json()) -rust_qube = Qube.from_json(json_str) -print(repr(rust_qube)) -expected = """root, class=d1 -├── dataset=another-value, generation=1/2/3 -└── dataset=climate-dt/weather-dt, generation=1/2/3/4 -""" -assert repr(rust_qube) == expected +# def test_from_json(): +# q = pyQube.from_tree(""" +# root, class=d1 +# ├── dataset=another-value, generation=1/2/3 +# └── dataset=climate-dt/weather-dt, generation=1/2/3/4 +# """) +# json_str = json.dumps(q.to_json()) +# rust_qube = Qube.from_json(json_str) +# print(repr(rust_qube)) + +# expected = """root, class=d1 +# ├── dataset=another-value, generation=1/2/3 +# └── dataset=climate-dt/weather-dt, generation=1/2/3/4 +# """ +# assert repr(rust_qube) == expected