A bit more on the rust backend

This commit is contained in:
Tom 2025-05-29 17:09:17 +02:00
parent 3328a0375b
commit aaafa28dfb
9 changed files with 363 additions and 73 deletions

View File

@ -61,7 +61,6 @@ class QubeNamedRoot:
"Helper class to print a custom root name" "Helper class to print a custom root name"
key: str key: str
dtype: str = "str"
children: tuple[Qube, ...] = () children: tuple[Qube, ...] = ()
def summary(self) -> str: def summary(self) -> str:
@ -388,6 +387,7 @@ class Qube:
for c in node.children: for c in node.children:
yield from to_list_of_cubes(c) yield from to_list_of_cubes(c)
else:
if not node.children: if not node.children:
yield {node.key: list(node.values)} yield {node.key: list(node.values)}

View File

@ -65,6 +65,20 @@ class ValueGroup(ABC):
T = TypeVar("T") T = TypeVar("T")
EnumValuesType = FrozenSet[T] EnumValuesType = FrozenSet[T]
_dtype_map: dict[str, type] = {
"str": str,
"int64": int,
"float64": float,
"date": datetime,
}
_dtype_map_inv: dict[type, str] = {v: k for k, v in _dtype_map.items()}
_dtype_formatters = {
"str": str,
"int64": int,
"float64": float,
"date": datetime.fromisoformat,
}
@dataclass(frozen=True, order=True) @dataclass(frozen=True, order=True)
class QEnum(ValueGroup): class QEnum(ValueGroup):
@ -76,10 +90,12 @@ class QEnum(ValueGroup):
values: EnumValuesType values: EnumValuesType
_dtype: str = "str" _dtype: str = "str"
def __init__(self, obj): def __init__(self, obj, dtype="str"):
object.__setattr__(self, "values", tuple(sorted(obj))) object.__setattr__(self, "values", tuple(sorted(obj)))
object.__setattr__( object.__setattr__(
self, "dtype", type(self.values[0]) if len(self.values) > 0 else "str" self,
"_dtype",
dtype,
) )
def __post_init__(self): def __post_init__(self):
@ -108,7 +124,18 @@ class QEnum(ValueGroup):
return min(self.values) return min(self.values)
def to_json(self): def to_json(self):
return list(self.values) return {"type": "enum", "dtype": self.dtype(), "values": self.values}
# @classmethod
# def from_json(cls, type: Literal["enum"], dtype: str, values: list):
# dtype_formatter = _dtype_formatters[dtype]
@classmethod
def from_list(cls, obj):
    """Build an enum from a plain list, inferring the dtype from its elements.

    Args:
        obj: Sequence of values that must all share one exact type.

    Returns:
        A new instance whose dtype name is looked up in ``_dtype_map_inv``.
        An empty sequence yields an instance with the default dtype.

    Raises:
        ValueError: If the elements do not all have the same type.
        KeyError: If the element type has no entry in ``_dtype_map_inv``.
    """
    # An empty list has no example element to infer a dtype from.
    if len(obj) == 0:
        return cls(obj)

    dtype = type(obj[0])
    # The previous `assert [ ... ]` tested the truthiness of a non-empty list
    # and therefore could never fail; `all(...)` performs the intended check.
    if not all(type(v) is dtype for v in obj):
        raise ValueError(f"Enum values must all have type {dtype.__name__}")
    return cls(obj, dtype=_dtype_map_inv[dtype])
@dataclass(frozen=True, order=True) @dataclass(frozen=True, order=True)
@ -389,17 +416,13 @@ class IntRange(Range):
return ranges return ranges
def values_from_json(obj) -> ValueGroup: def values_from_json(obj: dict | list) -> ValueGroup:
if isinstance(obj, list): if isinstance(obj, list):
return QEnum(tuple(obj)) return QEnum.from_list(obj)
match obj["dtype"]: match obj["type"]:
case "date": case "enum":
return DateRange(**obj) QEnum.from_json(**obj)
case "time":
return TimeRange(**obj)
case "int":
return IntRange(**obj)
case _: case _:
raise ValueError(f"Unknown dtype {obj['dtype']}") raise ValueError(f"Unknown dtype {obj['dtype']}")

View File

@ -3,10 +3,12 @@
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::wrap_pyfunction; use pyo3::wrap_pyfunction;
use pyo3::types::{PyDict, PyInt, PyList, PyString}; use pyo3::types::{PyDict, PyInt, PyList, PyString};
use python_interface::QubeError;
use std::collections::HashMap; use std::collections::HashMap;
use std::iter;
use pyo3::prelude::*; use pyo3::prelude::*;
use std::hash::Hash; use std::hash::Hash;
use std::rc::Rc;
use lasso::{Rodeo, Spur}; use lasso::{Rodeo, Spur};
use std::num::NonZero; use std::num::NonZero;
@ -15,6 +17,7 @@ use std::ops;
mod serialisation; mod serialisation;
mod python_interface; mod python_interface;
mod formatters; mod formatters;
mod set_operations;
// This data structure uses the Newtype Index Pattern // This data structure uses the Newtype Index Pattern
// See https://matklad.github.io/2018/06/04/newtype-index-pattern.html // See https://matklad.github.io/2018/06/04/newtype-index-pattern.html
@ -51,9 +54,6 @@ impl ops::Index<StringId> for Qube {
} }
impl NodeId { impl NodeId {
pub fn new_infallible(value: NonZero<usize>) -> NodeId {
NodeId(value)
}
pub fn new(value: usize) -> Option<NodeId> { pub fn new(value: usize) -> Option<NodeId> {
NonZero::new(value).map(NodeId) NonZero::new(value).map(NodeId)
} }
@ -70,7 +70,7 @@ impl ops::Index<StringId> for lasso::Rodeo {
} }
} }
#[derive(Debug)] #[derive(Debug, Clone)]
pub(crate) struct Node { pub(crate) struct Node {
pub key: StringId, pub key: StringId,
pub metadata: HashMap<StringId, Vec<String>>, pub metadata: HashMap<StringId, Vec<String>>,
@ -115,9 +115,14 @@ impl Node {
.map(|v| v.len()) .map(|v| v.len())
.sum() .sum()
} }
/// Yields the string form of every key in `children`, resolved through `q`.
fn keys<'a>(&'a self, q: &'a Qube) -> impl Iterator<Item = &'a str> {
    let child_keys = self.children.keys();
    child_keys.map(move |key_id| &q[*key_id])
}
} }
#[derive(Debug)] #[derive(Debug, Clone)]
#[pyclass(subclass, dict)] #[pyclass(subclass, dict)]
pub struct Qube { pub struct Qube {
pub root: NodeId, pub root: NodeId,
@ -142,7 +147,7 @@ impl Qube {
StringId(self.strings.get_or_intern(val)) StringId(self.strings.get_or_intern(val))
} }
pub fn add_node(&mut self, parent: NodeId, key: &str, values: &[&str]) -> NodeId { pub(crate) fn add_node(&mut self, parent: NodeId, key: &str, values: &[&str]) -> NodeId {
let key_id = self.get_or_intern(key); let key_id = self.get_or_intern(key);
let values = values.iter().map(|val| self.get_or_intern(val)).collect(); let values = values.iter().map(|val| self.get_or_intern(val)).collect();
@ -172,11 +177,59 @@ impl Qube {
let node = &self[node_id]; let node = &self[node_id];
node.summary(&self) node.summary(&self)
} }
/// Bundles `id`, the node stored under it, and this qube into a `NodeRef`.
fn get_node_ref(&self, id: NodeId) -> NodeRef {
    NodeRef {
        id,
        node: &self[id],
        qube: self,
    }
}
/// Looks up `s` in the string table and returns its id, or `None` when
/// `self.strings.get` finds nothing for it.
pub fn get_string_id(&self, s: &str) -> Option<StringId> {
    let interned = self.strings.get(s)?;
    Some(StringId(interned))
}
} }
#[pymodule] #[pymodule]
fn rust(m: &Bound<'_, PyModule>) -> PyResult<()> { fn rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<Qube>()?; m.add_class::<Qube>()?;
m.add("QubeError", py.get_type::<python_interface::QubeError>())?;
Ok(()) Ok(())
} }
/// A borrowed view of one node together with the qube it belongs to.
pub struct NodeRef<'a> {
    /// Id of the referenced node.
    pub id: NodeId,
    /// The node stored under `id`.
    pub node: &'a Node,
    /// The qube used to resolve child ids and strings.
    pub qube: &'a Qube,
}
impl<'a> NodeRef<'a> {
    /// Iterates over the keys of this node's children as strings.
    pub fn keys(&self) -> impl Iterator<Item = &str> {
        self.node.keys(self.qube)
    }

    /// Iterates over every child of this node, regardless of key.
    fn flat_children(&'a self) -> impl Iterator<Item = Self> {
        let qube = self.qube;
        self.node
            .children
            .values()
            .flatten()
            .map(move |child_id| NodeRef {
                id: *child_id,
                node: &qube[*child_id],
                qube,
            })
    }

    /// Iterates over the children stored under `key`.
    ///
    /// Yields nothing when `key` has no string id in the qube or when this
    /// node has no children under that key.
    fn children_by_key(&'a self, key: &str) -> impl Iterator<Item = Self> {
        let qube = self.qube;
        qube.get_string_id(key)
            .and_then(|key_id| self.node.children.get(&key_id))
            .into_iter()
            .flatten()
            .map(move |child_id| NodeRef {
                id: *child_id,
                node: &qube[*child_id],
                qube,
            })
    }
}

View File

@ -1,19 +1,34 @@
use crate::{Node, NodeId, Qube}; use crate::{Node, NodeId, Qube, NodeRef};
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::PyList; use pyo3::types::{PyList, PyType};
use core::borrow;
use std::ops::Deref; use std::ops::Deref;
use std::cell::Ref;
use crate::set_operations;
use crate::serialisation; use crate::serialisation;
use itertools::Itertools;
use pyo3::create_exception;
create_exception!(qubed, QubeError, pyo3::exceptions::PyException);
/// A reference to a particular node in a Qube /// A reference to a particular node in a Qube
#[pyclass] #[pyclass]
pub struct NodeRef { pub struct PyNodeRef {
id: NodeId, id: NodeId,
qube: Py<Qube>, // see https://pyo3.rs/v0.23.1/types for a discussion of Py<T> and Bound<'py, T> qube: Py<Qube>, // see https://pyo3.rs/v0.23.1/types for a discussion of Py<T> and Bound<'py, T>
} }
fn into_py_node_ref(node_ref: NodeRef, qube: Py<Qube>) -> PyNodeRef {
PyNodeRef {
id: node_ref.id,
qube: qube,
}
}
#[pymethods] #[pymethods]
impl NodeRef { impl PyNodeRef {
fn __repr__(&self, py: Python) -> PyResult<String> { fn __repr__(&self, py: Python) -> PyResult<String> {
// Get the Py<Qube> reference, bind it to the GIL. // Get the Py<Qube> reference, bind it to the GIL.
let qube = self.qube.bind(py); let qube = self.qube.bind(py);
@ -43,13 +58,13 @@ impl NodeRef {
} }
#[getter] #[getter]
pub fn get_children(&self, py: Python) -> Vec<NodeRef> { pub fn get_children(&self, py: Python) -> Vec<Self> {
let qube = self.qube.bind(py).borrow(); let qube = self.qube.bind(py).borrow();
let node = &qube[self.id]; let node = &qube[self.id];
node.children node.children
.values() .values()
.flatten() .flatten()
.map(|child_id| NodeRef { .map(|child_id| Self {
id: *child_id, id: *child_id,
qube: self.qube.clone_ref(py), qube: self.qube.clone_ref(py),
}) })
@ -57,6 +72,21 @@ impl NodeRef {
} }
} }
/// Either a single `T` or a list of `T`, extracted from a Python argument.
///
/// NOTE(review): the derived `FromPyObject` presumably tries the variants in
/// declaration order (`One` before `Many`) — confirm against the PyO3 docs.
#[derive(FromPyObject)]
pub enum OneOrMany<T> {
    /// A single value.
    One(T),
    /// Several values.
    Many(Vec<T>),
}
/// Flattens a `OneOrMany<T>` into a `Vec<T>`.
///
/// `One(v)` becomes a one-element vector and `Many(vs)` is returned as is.
/// Implemented as `From` so that both `Vec::from(x)` and `x.into()` work; the
/// standard blanket impl supplies `Into<Vec<T>>` automatically.
impl<T> From<OneOrMany<T>> for Vec<T> {
    fn from(value: OneOrMany<T>) -> Self {
        match value {
            OneOrMany::One(v) => vec![v],
            OneOrMany::Many(vs) => vs,
        }
    }
}
#[pymethods] #[pymethods]
impl Qube { impl Qube {
#[new] #[new]
@ -64,16 +94,56 @@ impl Qube {
Qube::new() Qube::new()
} }
/// Adds a child node under `parent` and returns a reference to it.
///
/// Exposed to Python as `add_node`. `values` may be a single string or a
/// list of strings.
///
/// # Errors
///
/// Returns `QubeError` when `parent` refers to a different qube than `slf`.
#[pyo3(name = "add_node")]
pub fn py_add_node(
    slf: Bound<'_, Self>,
    parent: PyRef<'_, PyNodeRef>,
    key: &str,
    values: OneOrMany<String>,
) -> PyResult<PyNodeRef> {
    // Check that the given parent is actually in this qube and not another one
    if !parent.qube.bind(slf.py()).is(&slf) {
        return Err(QubeError::new_err("Supplied parent node is not in the target qube."))
    }

    // massage values from T | Vec<T> into Vec<T>
    let values: Vec<String> = values.into();
    // `add_node` takes `&[&str]`, so borrow each owned string.
    let values_refs: Vec<&str> = values.iter().map(String::as_str).collect();

    // The mutable borrow is taken only after the identity check above.
    let mut q = slf.borrow_mut();
    let node_id = q.add_node(parent.id, key, &values_refs);
    Ok(PyNodeRef { id: node_id, qube: slf.into()})
}
/// Makes `node` the root of this qube by storing its id in `root`.
///
/// NOTE(review): unlike `add_node`, this does not check that `node` belongs
/// to this qube — confirm whether that check is wanted here.
pub fn set_root(slf: Bound<'_, Self>, node: PyRef<'_, PyNodeRef>) {
    let new_root = node.id;
    slf.borrow_mut().root = new_root;
}
#[getter] #[getter]
fn get_root(slf: Bound<'_, Self>) -> PyResult<NodeRef> { fn get_root(slf: Bound<'_, Self>) -> PyResult<PyNodeRef> {
Ok(NodeRef { Ok(PyNodeRef {
id: slf.borrow().root, id: slf.borrow().root,
qube: slf.unbind(), qube: slf.unbind(),
}) })
} }
fn __repr__(&self) -> String { fn __repr__(&self) -> String {
self.string_tree() // format!("{:?}", self)
let nodes_str: String = self.nodes.iter()
.enumerate()
.map(|(id, node)| {
format!("{{id: {}, key: {}, values: [{}], children: [{}]}}",
id+1,
&self[node.key],
node.values.iter().map(|s| &self[*s]).join(", "),
node.children().map(|n| n.0).join(", "),
)
}).join(", ");
format!("Qube {{root: {}, nodes: {}}}", self.root.0, nodes_str)
} }
fn __str__<'py>(&self) -> String { fn __str__<'py>(&self) -> String {
@ -90,8 +160,8 @@ impl Qube {
} }
#[getter] #[getter]
pub fn get_children(slf: Bound<'_, Self>, py: Python) -> PyResult<Vec<NodeRef>> { pub fn get_children(slf: Bound<'_, Self>, py: Python) -> PyResult<Vec<PyNodeRef>> {
let root = NodeRef { let root = PyNodeRef {
id: slf.borrow().root, id: slf.borrow().root,
qube: slf.unbind(), qube: slf.unbind(),
}; };
@ -102,4 +172,8 @@ impl Qube {
pub fn from_json(data: &str) -> Result<Self, serialisation::JSONError> { pub fn from_json(data: &str) -> Result<Self, serialisation::JSONError> {
serialisation::from_json(data) serialisation::from_json(data)
} }
/// Python `|` operator: delegates to `set_operations::set_operation` with
/// `Op::Union` on shared borrows of both operands.
pub fn __or__(slf: Bound<'_, Self>, other: Bound<'_, Qube>) -> Qube {
    let lhs = slf.borrow();
    let rhs = other.borrow();
    set_operations::set_operation(&lhs, &rhs, set_operations::Op::Union)
}
} }

View File

@ -24,10 +24,23 @@ impl From<serde_json::Error> for JSONError {
} }
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(untagged)] #[serde(tag = "dtype")]
enum Ranges {
Int64{values: Vec<(i64, i64)>}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "dtype", rename_all = "lowercase")]
enum Enum {
Str{values: Vec<String>}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "type", rename_all = "lowercase")]
enum Values { enum Values {
Wildcard(String), Wildcard{},
Enum(Vec<String>), Enum(Enum),
Range(Ranges)
} }
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
@ -43,8 +56,9 @@ fn add_nodes(qube: &mut Qube, parent: NodeId, nodes: &[JSONQube]) -> Vec<NodeId>
.iter() .iter()
.map(|json_node| { .map(|json_node| {
let values = match &json_node.values { let values = match &json_node.values {
Values::Wildcard(_) => &vec!["*"], Values::Wildcard{} => &vec!["*"],
Values::Enum(strings) => &strings.iter().map(|s| s.as_str()).collect(), Values::Enum(Enum::Str{values}) => &values.iter().map(|s| s.as_str()).collect(),
Values::Range(_) => todo!(),
}; };
let node_id = qube.add_node(parent, &json_node.key, values); let node_id = qube.add_node(parent, &json_node.key, values);

View File

@ -0,0 +1,40 @@
use crate::NodeRef;
use crate::{Node, NodeId, Qube};
use itertools::chain;
use std::collections::HashSet;
/// The set operations that can be requested between two qubes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Op {
    Union,
    Intersection,
    Difference,
    SymmetricDifference,
}
fn op_to_venn_diagram(op: Op) -> (bool, bool, bool) {
use Op::*;
match op {
Union => (true, true, true),
Intersection => (false, true, false),
Difference => (true, false, false),
SymmetricDifference => (true, false, true),
}
}
/// Intended to combine `a` and `b` according to `op` into a new `Qube`.
///
/// # Panics
///
/// Always panics: the body is currently `todo!()`.
pub fn set_operation<'a>(a: &'a Qube, b: &'a Qube, op: Op) -> Qube {
    todo!()
    // NOTE(review): the sketch below passes `a.root_ref()` for both operands;
    // the second argument was presumably meant to come from `b`.
    // _set_operation(a.root_ref(), a.root_ref(), op)
}
// fn _set_operation<'a>(a: NodeRef, b: NodeRef, op: Op) -> Qube {
// let keys: HashSet<&str> = HashSet::from_iter(chain(a.keys(), b.keys()));
// for key in keys {
// let a = a.children_by_key(key)
// }
// todo!()
// }
/// Placeholder for an in-place set operation on `a`.
///
/// Currently returns `a` unchanged; `b` and `op` are not used.
pub fn set_operation_inplace<'a>(a: &'a mut Qube, b: &'a Qube, op: Op) -> &'a Qube {
    a
}

View File

@ -1,21 +1,81 @@
from __future__ import annotations from __future__ import annotations
import json from datetime import datetime
from typing import Sequence
from qubed import Qube as pyQube from qubed.rust import Qube as rsQube
from qubed.rust import Qube as Qube
q = pyQube.from_tree(""" # q = pyQube.from_tree("""
root, class=d1 # root, class=d1
dataset=another-value, generation=1/2/3 # ├── dataset=another-value, generation=1/2/3
dataset=climate-dt/weather-dt, generation=1/2/3/4 # └── dataset=climate-dt/weather-dt, generation=1/2/3/4
""") # """)
json_str = json.dumps(q.to_json()) # json_str = json.dumps(q.to_json())
rust_qube = Qube.from_json(json_str) # rust_qube = Qube.from_json(json_str)
print(repr(rust_qube)) # # print(repr(rust_qube))
expected = """root, class=d1 # # print(json_str)
dataset=another-value, generation=1/2/3
dataset=climate-dt/weather-dt, generation=1/2/3/4""" # expected = """root, class=d1
assert repr(rust_qube) == expected # ├── dataset=another-value, generation=1/2/3
# print(rs_qube._repr_html_()) # └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """
# assert repr(rust_qube) == expected
# # print(rs_qube._repr_html_())
# print(q | q)
value = str | int | float | datetime
class Qube(rsQube):
    """Python-side convenience constructors layered on the Rust ``Qube``."""

    @classmethod
    def empty(cls):
        """Create a new qube via the default constructor and return it."""
        q = cls()
        print(f"empty called {cls = } {q = }")
        return q

    @classmethod
    def from_datacube(cls, datacube: dict[str, value | Sequence[value]]) -> Qube:
        """Build a qube with one node per key, each nested under the previous.

        Args:
            datacube: Mapping from key to a single value or a sequence of
                values. An empty mapping yields an empty qube instead of
                failing while unpacking the first item.

        Returns:
            The populated qube.
        """
        qube = cls.empty()
        # Start at the root; every added node becomes the parent of the next.
        node = qube.root
        for key, values in datacube.items():
            node = qube.add_node(parent=node, key=key, values=values)
        return qube

    @classmethod
    def from_dict(cls, d: dict) -> Qube:
        """Build a qube from nested dicts keyed by ``"key=v1/v2/..."`` strings.

        Args:
            d: Nested mapping; each key is ``"<key>=<values>"`` with values
                separated by ``/``, and each value is the mapping of children.

        Returns:
            The populated qube.

        Raises:
            ValueError: If a dictionary key contains no ``=``.
        """
        q = cls.empty()

        def add_children(parent, children: dict):
            for k, grandchildren in children.items():
                # Split on the first "=" only so values may contain "=".
                key, values = k.split("=", 1)
                node = q.add_node(
                    parent=parent,
                    key=key,
                    values=values.split("/"),
                )
                add_children(parent=node, children=grandchildren)

        add_children(q.root, d)
        return q
q = Qube.from_datacube({"a": ["4"], "b": "test", "c": ["1", "2", "3"]})
print(q)
print(repr(q))
q = Qube.from_dict(
{
"a=2/3": {"b=1": {}},
"a2=a/b": {"b2=1/2": {}},
}
)
print(q)
print(repr(q))

View File

@ -16,3 +16,29 @@ def test_iter_leaves_simple():
] ]
assert set(make_hashable(q.leaves())) == set(make_hashable(entries)) assert set(make_hashable(q.leaves())) == set(make_hashable(entries))
def test_datacubes():
q = Qube.from_tree("""
root, class=d1
date=19920101/19930101/19940101, params=1/2/3
date=19950101
level=1/2/3, params=1/2/3/4
params=1/2/3/4
""")
assert len(list(q.datacubes())) == 3
assert list(q.datacubes()) == [
{
"class": ["d1"],
"date": ["19920101", "19930101", "19940101"],
"params": ["1", "2", "3"],
},
{
"class": ["d1"],
"date": ["19950101"],
"level": ["1", "2", "3"],
"params": ["1", "2", "3", "4"],
},
{"class": ["d1"], "date": ["19950101"], "params": ["1", "2", "3", "4"]},
]

View File

@ -1,21 +1,21 @@
from __future__ import annotations from __future__ import annotations
import json
from qubed import Qube as pyQube
from qubed.rust import Qube as Qube from qubed.rust import Qube as Qube
q = pyQube.from_tree("""
root, class=d1
dataset=another-value, generation=1/2/3
dataset=climate-dt/weather-dt, generation=1/2/3/4
""")
json_str = json.dumps(q.to_json())
rust_qube = Qube.from_json(json_str)
print(repr(rust_qube))
expected = """root, class=d1 # def test_from_json():
dataset=another-value, generation=1/2/3 # q = pyQube.from_tree("""
dataset=climate-dt/weather-dt, generation=1/2/3/4 # root, class=d1
""" # ├── dataset=another-value, generation=1/2/3
assert repr(rust_qube) == expected # └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """)
# json_str = json.dumps(q.to_json())
# rust_qube = Qube.from_json(json_str)
# print(repr(rust_qube))
# expected = """root, class=d1
# ├── dataset=another-value, generation=1/2/3
# └── dataset=climate-dt/weather-dt, generation=1/2/3/4
# """
# assert repr(rust_qube) == expected