This commit is contained in:
Tom 2024-12-11 14:08:18 +00:00
parent 523773d467
commit 04a0cd8ab2
10 changed files with 1685 additions and 231 deletions

View File

@ -1,130 +0,0 @@
# * Format of the rules is:
# [a1, a2, a3 ...[b1, b2, b3... [c1, c2, c3...]]]
# - The first level (a) defines which attributes are used to name the top level directory
# - The second level (b) defines which attributes are used to name the data files
# - The third level (c) defines which attributes are used as index keys
# * Rules can be grouped
# [a1, a2, a3 ...
# [b1, b2, b3... [c1, c2, c3...]]
# [B1, B2, B3... [C1, C2, C3...]]
# ]
# * A list of values can be given for an attribute
# [ ..., stream=enfo/efov, ... ]
# This will be used when matching rules.
# * Attributes can be typed
# Globally, at the beginning of this file:
# refdate: Date;
# or in the context of a rule:
# [type=cl, ... [date:ClimateMonth, ...]]
# Typing attributes is done when the user's requests or the GRIB values need to be modified before directories, files and indexes are created. For example, ClimateMonth will transform 2010-04-01 to 'may' internally.
# * Attributes can be optional
# [ step, levelist?, param ]
# They will be replaced internally by an empty value. It is also possible to provide a default substitution value: e.g. [domain?g] will consider the domain to be 'g' if missing.
# * Attributes can be removed:
# [grid-]
# This is useful to remove attributes present in the GRIB that should be ignored
# * Rules are matched:
# - If the attributes are present in the GRIB/Request, or marked optional or ignored
# - If a list of possible value is provided, one of them must match, for example
# [ class, expver, stream=enfo/efov, date, time, domain ]
# will match either stream=enfo or stream=efov, all other attributes will be matched if they exist in the GRIB or user's request
# * On archive:
# - Attributes are extracted from the GRIB (namespace 'mars'), possibly modified by the attribute type
# - Only the first rule is used, so order is important
# - All GRIB attributes must be used by the rules, otherwise an error is raised
# * On retrieve:
# - Attributes are extracted from the user's request, possibly modified by the attribute type (e.g. for handling of U/V)
# - All the matching rules are considered
# - Only attributes listed in the rules are used to extract values from the user's request
# Default types
param: Param;
step: Step;
date: Date;
hdate: Date;
refdate: Date;
latitude: Double;
longitude: Double;
levelist: Double;
grid: Grid;
expver: Expver;
time: Time;
fcmonth: Integer;
number: Integer;
frequency: Integer;
direction: Integer;
channel: Integer;
instrument: Integer;
ident: Integer;
diagnostic: Integer;
iteration: Integer;
system: Integer;
method: Integer;
########################################################
# These are the rules for the Climate DT
# clte/wave
[ class=d1, dataset=climate-dt, activity, experiment, generation, model, realization, expver, stream=clte/wave, date
[ resolution, type, levtype
[ time, levelist?, param, frequency?, direction? ]]
]
# clmn
[ class=d1, dataset=climate-dt, activity, experiment, generation, model, realization, expver, stream=clmn, year
[ month, resolution, type, levtype
[ levelist?, param ]]
]
########################################################
# These are the rules for the Extremes DT
# oper/wave/lwda/lwwv
[ class=d1, dataset=extremes-dt, expver, stream=oper/wave/lwda/lwwv, date, time
[ resolution?, type, levtype
[ step, levelist?, param, frequency?, direction? ]]
]
########################################################
# These are the rules for the On-Demand Extremes DT
# oper/wave
[ class=d1, dataset=on-demand-extremes-dt, expver, stream=oper/wave, date, time
[ type, levtype
[ step, levelist?, param, frequency?, direction? ]]
]
########################################################
########################################################
#
# These are the rules for rd
# oper/wave/lwda/lwwv
[ class=rd, expver, stream=oper/wave/lwda/lwwv/dcda/enfo, date, time, domain?
[ type, levtype
[ number?, step, levelist?, param, frequency?, direction? ]]
]
[ class=rd, expver, stream=mnth, domain?
[ type, levtype
[ date , time, step?, levelist?, param ]]
]

1253
config/local/language.yaml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,24 +1,14 @@
import json
import yaml
from pathlib import Path
import os
from datetime import datetime
from collections import defaultdict
from typing import Any, Dict
import yaml
import os
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import RedirectResponse, FileResponse
from fastapi.templating import Jinja2Templates
from TreeTraverser.fdb_schema import FDBSchemaFile
from TreeTraverser.CompressedTree import CompressedTree
import redis
import yaml
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from tree_traverser import CompressedTree
app = FastAPI()
app.add_middleware(
@ -33,20 +23,26 @@ app.add_middleware(
async def favicon():
return FileResponse("favicon.ico")
with open(os.environ.get("CONFIG_DIR", ".") + "/config.yaml", "r") as f:
config = yaml.safe_load(f)
print("Getting cache from redis")
r = redis.Redis(host=os.environ.get("REDIS_HOST", "localhost"), port=6379, db=0)
json_data = r.get('compressed_catalog')
if not json_data:
print("Didn't find compressed tree in redis using empty tree")
c_tree = CompressedTree({})
if "local_cache" in config:
print("Getting cache from local file")
with open(config["local_cache"], "r") as f:
json_data = f.read()
print("Found compressed catalog in local file")
else:
print("Getting cache from redis")
r = redis.Redis(host=os.environ.get("REDIS_HOST", "localhost"), port=6379, db=0)
json_data = r.get('compressed_catalog')
print("Loading tree to json")
if not json_data:
c_tree = CompressedTree.from_json({})
else:
print("Loading tree to json")
compressed_tree_json = json.loads(json_data)
c_tree = CompressedTree.from_json(compressed_tree_json)
print("Partialy decompressing tree, shoud be able to skip this step in future.")
tree = c_tree.reconstruct_compressed_ecmwf_style()
@ -61,9 +57,6 @@ config = {
with open(config["mars_language"], "r") as f:
mars_language = yaml.safe_load(f)["_field"]
###### Load FDB Schema
schema = FDBSchemaFile(config["fdb_schema"])
def request_to_dict(request: Request) -> Dict[str, Any]:
# Convert query parameters to dictionary format
request_dict = dict(request.query_params)
@ -167,20 +160,6 @@ async def get_STAC(request: Request):
request_dict = request_to_dict(request)
paths = await api_paths(request)
# # Run the schema matching logic
# matches = schema.match_all(dict(v.split("=") for v in path))
# # Only take the longest matches
# max_len = max(len(m) for m in matches)
# matches = [m for m in matches if len(m) == max_len]
# # Take the ends of all partial matches, ignore those that are full matches
# # Full matches are indicated by the last key having boolean value True
# key_frontier = defaultdict(list)
# for match in matches:
# if not match[-1]:
# key_frontier[match[-1].key].append([m for m in match[:-1]])
def make_link(key_name, paths, values):
"""Take a MARS Key and information about which paths matched up to this point and use it to make a STAC Link"""
@ -221,7 +200,7 @@ async def get_STAC(request: Request):
def value_descriptions(key, values):
return {
v[0] : v[-1] for v in mars_language.get(key, {}).get("values", [])
if len(v) > 1 and v[0] in values
if len(v) > 1 and v[0] in list(values)
}
descriptions = {

View File

@ -1,3 +1,3 @@
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
cd "$parent_path"
REDIS_HOST=localhost CONFIG_DIR=../config/destinE fastapi dev ./main.py --port 8124 --reload
CONFIG_DIR=../config/local fastapi dev ./main.py --port 8124 --reload

View File

@ -0,0 +1,5 @@
```
pip install maturin
maturin develop
```

View File

@ -1,6 +1,5 @@
import json
from collections import defaultdict
from typing import TypeVar
from pathlib import Path
Tree = dict[str, "Tree"]
@ -71,11 +70,13 @@ class CompressedTree():
k, *rest = path
return self._add_to_cache(RefcountedDict({k : self._cache_path(rest)}))
def reconstruct(self) -> dict[str, dict]:
def reconstruct(self, max_depth = None) -> dict[str, dict]:
"Reconstruct the tree as a normal nested dictionary"
def reconstruct_node(h : int) -> dict[str, dict]:
return {k : reconstruct_node(v) for k, v in self.cache[h].items()}
return reconstruct_node(self.root_hash)
def reconstruct_node(h : int, depth : int) -> dict[str, dict]:
if max_depth is not None and depth > max_depth:
return {}
return {k : reconstruct_node(v, depth+1) for k, v in self.cache[h].items()}
return reconstruct_node(self.root_hash, 0)
def reconstruct_compressed(self) -> dict[str, dict]:
"Reconstruct the tree as a normal nested dictionary"
@ -87,18 +88,18 @@ class CompressedTree():
return {"/".join(keys) : reconstruct_node(h) for h, keys in dedup.items()}
return reconstruct_node(self.root_hash)
def reconstruct_compressed_ecmwf_style(self) -> dict[str, dict]:
def reconstruct_compressed_ecmwf_style(self, max_depth=None, from_node=None) -> dict[str, dict]:
"Reconstruct the tree as a normal nested dictionary"
def reconstruct_node(h : int) -> dict[str, dict]:
def reconstruct_node(h : int, depth : int) -> dict[str, dict]:
if max_depth is not None and depth > max_depth:
return {}
dedup : dict[tuple[int, str], set[str]] = defaultdict(set)
for k, h2 in self.cache[h].items():
key, value = k.split("=")
dedup[(h2, key)].add(value)
return {f"{key}={','.join(values)}" : reconstruct_node(h) for (h, key), values in dedup.items()}
return reconstruct_node(self.root_hash)
return {f"{key}={','.join(values)}" : reconstruct_node(h, depth=depth+1) for (h, key), values in dedup.items()}
return reconstruct_node(from_node or self.root_hash, depth=0)
def __init__(self, tree : Tree):
self.cache = {}
@ -124,8 +125,8 @@ class CompressedTree():
h = loc[key] # get the hash of the subtree
loc = self.cache[h] # get the subtree
else:
return False, keys[:i]
return True, keys
return False, keys[:i], h
return True, keys, h
def keys(self, keys : tuple[str, ...] = ()) -> list[str] | None:
loc = self.tree

View File

@ -1,2 +1,2 @@
from . import rust as backend
from .CompressedTree import CompressedTree
from .CompressedTree import CompressedTree, RefcountedDict

View File

@ -0,0 +1,334 @@
#![allow(dead_code)]
use std::rc::Rc;
use smallstr::SmallString;
use slotmap::{new_key_type, SlotMap};
// Typed slotmap key used to address nodes stored in `QueryTree::nodes`.
new_key_type! {
    struct NodeId;
}

// Small-string type: values up to 16 bytes are stored inline without a
// heap allocation.
type CompactString = SmallString<[u8; 16]>;
/// The scalar types a node value may hold.
#[derive(Clone)]
enum NodeValueTypes {
    String(CompactString),
    Int(i32),
}

impl From<&str> for NodeValueTypes {
    fn from(s: &str) -> Self {
        NodeValueTypes::String(CompactString::from(s))
    }
}

impl From<i32> for NodeValueTypes {
    fn from(i: i32) -> Self {
        NodeValueTypes::Int(i)
    }
}

/// A node stores either one scalar or a list of scalars.
enum NodeValue {
    Single(NodeValueTypes),
    Multiple(Vec<NodeValueTypes>),
}
/// One tree node. Children form an intrusive doubly linked list threaded
/// through the sibling pointers, so the parent only stores the endpoints.
struct Node<Payload> {
    // Shared key string; `Rc` so many nodes can reuse one allocation.
    key: Rc<String>,
    value: NodeValue,
    parent: Option<NodeId>,
    prev_sibling: Option<NodeId>,
    next_sibling: Option<NodeId>,
    // vector may be faster for traversal, but linkedlist should be faster for insertion
    children: Option<(NodeId, NodeId)>, // (first_child, last_child)
    // Optional user payload attached via `add_payload`.
    data: Option<Payload>,
}

/// A forest of `Node`s backed by a slotmap arena.
struct QueryTree<Payload> {
    nodes: SlotMap<NodeId, Node<Payload>>,
}
impl<Payload> QueryTree<Payload> {
fn new() -> Self {
QueryTree {
nodes: SlotMap::with_key(),
}
}
// Adds a node with a key and single value
fn add_node<S>(&mut self, key: &Rc<String>, value: S, parent: Option<NodeId>) -> NodeId
where
S: Into<NodeValueTypes>,
{
let node_id = self.nodes.insert_with_key(|_| Node {
key: Rc::clone(key),
value: NodeValue::Single(value.into()),
parent,
prev_sibling: None,
next_sibling: None,
children: None,
data: None,
});
if let Some(parent_id) = parent {
// Determine if parent has existing children
if let Some((first_child_id, last_child_id)) = self.nodes[parent_id].children {
// Update the last child's `next_sibling`
{
let last_child = &mut self.nodes[last_child_id];
last_child.next_sibling = Some(node_id);
}
// Update the new node's `prev_sibling`
{
let new_node = &mut self.nodes[node_id];
new_node.prev_sibling = Some(last_child_id);
}
// Update parent's last child
let parent_node = &mut self.nodes[parent_id];
parent_node.children = Some((first_child_id, node_id));
} else {
// No existing children
let parent_node = &mut self.nodes[parent_id];
parent_node.children = Some((node_id, node_id));
}
}
node_id
}
// Add a single value to a node
fn add_value<S>(&mut self, node_id: NodeId, value: S)
where
S: Into<NodeValueTypes>,
{
if let Some(node) = self.nodes.get_mut(node_id) {
match &mut node.value {
NodeValue::Single(v) => {
let values = vec![v.clone(), value.into()];
node.value = NodeValue::Multiple(values);
}
NodeValue::Multiple(values) => {
values.push(value.into());
}
}
}
}
// Add multiple values to a node
fn add_values<S>(&mut self, node_id: NodeId, values: Vec<S>)
where
S: Into<NodeValueTypes>,
{
if let Some(node) = self.nodes.get_mut(node_id) {
match &mut node.value {
NodeValue::Single(v) => {
let mut new_values = vec![v.clone()];
new_values.extend(values.into_iter().map(|v| v.into()));
node.value = NodeValue::Multiple(new_values);
}
NodeValue::Multiple(existing_values) => {
existing_values.extend(values.into_iter().map(|v| v.into()));
}
}
}
}
fn get_node(&self, node_id: NodeId) -> Option<&Node<Payload>> {
self.nodes.get(node_id)
}
// TODO: better if this returns an iterator?
fn get_children(&self, node_id: NodeId) -> Vec<NodeId> {
let mut children = Vec::new();
if let Some(node) = self.get_node(node_id) {
if let Some((first_child_id, _)) = node.children {
let mut current_id = Some(first_child_id);
while let Some(cid) = current_id {
children.push(cid);
current_id = self.nodes[cid].next_sibling;
}
}
}
children
}
fn remove_node(&mut self, node_id: NodeId) {
// Remove the node and update parent and siblings
if let Some(node) = self.nodes.remove(node_id) {
// Update parent's children
if let Some(parent_id) = node.parent {
let parent_node = self.nodes.get_mut(parent_id).unwrap();
if let Some((first_child_id, last_child_id)) = parent_node.children {
if first_child_id == node_id && last_child_id == node_id {
// Node was the only child
parent_node.children = None;
} else if first_child_id == node_id {
// Node was the first child
parent_node.children = Some((node.next_sibling.unwrap(), last_child_id));
} else if last_child_id == node_id {
// Node was the last child
parent_node.children = Some((first_child_id, node.prev_sibling.unwrap()));
}
}
}
// Update siblings
if let Some(prev_id) = node.prev_sibling {
self.nodes[prev_id].next_sibling = node.next_sibling;
}
if let Some(next_id) = node.next_sibling {
self.nodes[next_id].prev_sibling = node.prev_sibling;
}
// Recursively remove children
let children_ids = self.get_children(node_id);
for child_id in children_ids {
self.remove_node(child_id);
}
}
}
fn is_root(&self, node_id: NodeId) -> bool {
self.nodes[node_id].parent.is_none()
}
fn is_leaf(&self, node_id: NodeId) -> bool {
self.nodes[node_id].children.is_none()
}
fn add_payload(&mut self, node_id: NodeId, payload: Payload) {
if let Some(node) = self.nodes.get_mut(node_id) {
node.data = Some(payload);
}
}
fn print_tree(&self) {
// Find all root nodes (nodes without a parent)
let roots: Vec<NodeId> = self
.nodes
.iter()
.filter_map(|(id, node)| {
if node.parent.is_none() {
Some(id)
} else {
None
}
})
.collect();
// Iterate through each root node and print its subtree
for (i, root_id) in roots.iter().enumerate() {
let is_last = i == roots.len() - 1;
self.print_node(*root_id, String::new(), is_last);
}
}
/// Recursively prints a node and its children.
///
/// - `node_id`: The current node's ID.
/// - `prefix`: The string prefix for indentation and branch lines.
/// - `is_last`: Boolean indicating if the node is the last child of its parent.
fn print_node(&self, node_id: NodeId, prefix: String, is_last: bool) {
// Retrieve the current node
let node = match self.nodes.get(node_id) {
Some(n) => n,
None => return, // Node not found; skip
};
// Determine the branch character
let branch = if prefix.is_empty() {
"" // Root node doesn't have a branch
} else if is_last {
"└── " // Last child
} else {
"├── " // Middle child
};
// Print the current node's key and values
print!("{}{}{}", prefix, branch, node.key);
match &node.value {
NodeValue::Single(v) => match v {
NodeValueTypes::String(s) => println!(": ({})", s),
NodeValueTypes::Int(i) => println!(": ({})", i),
},
NodeValue::Multiple(vs) => {
let values: Vec<String> = vs
.iter()
.map(|v| match v {
NodeValueTypes::String(s) => s.to_string(),
NodeValueTypes::Int(i) => i.to_string(),
})
.collect();
println!(": ({})", values.join(", "));
}
}
// Prepare the prefix for child nodes
let new_prefix = if prefix.is_empty() {
if is_last {
" ".to_string()
} else {
"".to_string()
}
} else {
if is_last {
format!("{} ", prefix)
} else {
format!("{}", prefix)
}
};
// Retrieve and iterate through child nodes
if let Some((_first_child_id, _last_child_id)) = node.children {
let children = self.get_children(node_id);
let total = children.len();
for (i, child_id) in children.iter().enumerate() {
let child_is_last = i == total - 1;
self.print_node(*child_id, new_prefix.clone(), child_is_last);
}
}
}
}
/// Benchmark: build a three-level tree (100 x 100 x 1000 nodes under one
/// root) and report how long the construction took.
fn main() {
    use std::time::Instant;

    let mut tree: QueryTree<i16> = QueryTree::new();

    let payload = "hello";
    let axis = Rc::new("foo".to_string());
    let root = tree.add_node(&axis, payload, None);

    let started = Instant::now();
    for _ in 0..100 {
        let level1 = tree.add_node(&axis, payload, Some(root));
        for _ in 0..100 {
            let level2 = tree.add_node(&axis, payload, Some(level1));
            for _ in 0..1000 {
                let _leaf = tree.add_node(&axis, payload, Some(level2));
            }
        }
    }

    // 1 root + 100 + 100*100 + 100*100*1000 nodes in total.
    assert_eq!(tree.nodes.len(), 10_010_101);

    let elapsed = started.elapsed();
    println!("Elapsed: {:.2?}", elapsed);
}

View File

@ -1 +1 @@
flask run --debug --port=5005
flask run --debug --port=5006

View File

@ -73,9 +73,13 @@ function goToNextUrl() {
values.push(timePicker.value.replace(":", ""));
}
const enum_checkboxes = item.querySelectorAll("input[type='checkbox']:checked");
const enum_checkboxes = item.querySelectorAll(
"input[type='checkbox']:checked"
);
if (enum_checkboxes.length > 0) {
values.push(...Array.from(enum_checkboxes).map((checkbox) => checkbox.value));
values.push(
...Array.from(enum_checkboxes).map((checkbox) => checkbox.value)
);
}
const any = item.querySelector("input[type='text']");
@ -104,7 +108,7 @@ function goToNextUrl() {
);
if (existingIndex !== -1) {
// If the key already exists,
// If the key already exists,
// and the values aren't already in there,
// append the values
request[existingIndex][1] = [...request[existingIndex][1], ...values];
@ -125,7 +129,6 @@ async function createCatalogItem(link, itemsContainer) {
itemsContainer.appendChild(itemDiv);
try {
// Update the item div with real content
itemDiv.classList.remove("loading");
@ -134,16 +137,19 @@ async function createCatalogItem(link, itemsContainer) {
// add data-key attribute to the itemDiv
itemDiv.dataset.key = link.title;
itemDiv.dataset.keyType = dimension.type;
itemDiv.innerHTML = `
<h3 class="item-title">${link.title || "No title available"}</h3>
<p class="item-type">Key Type: ${itemDiv.dataset.keyType || "Unknown"}</p>
<!-- <p class="item-type">Paths: ${dimension.paths}</p> -->
<p class="item-type">Optional: ${dimension.optional ? "Yes" : "No"}</p>
<p class="item-description">${dimension.description ? dimension.description.slice(0, 100) : "No description available"}...</p>
<p class="item-description">${
dimension.description
? dimension.description.slice(0, 100)
: "No description available"
}...</p>
`;
// if (dimension.type === "date" || dimension.type === "time") {
// // Render a date picker for the "date" key
// const picker = `<input type="${link.title}" name="${link.title}">`;
@ -155,7 +161,7 @@ async function createCatalogItem(link, itemsContainer) {
// }
// Otherwise create a scrollable list with checkboxes for values if available
if (
// dimension.type === "enum" &&
// dimension.type === "enum" &&
dimension.values &&
dimension.values.length > 0
) {
@ -173,20 +179,24 @@ async function createCatalogItem(link, itemsContainer) {
}
function renderCheckboxList(link) {
const dimension = link["generalized_datacube:dimension"];
const value_descriptions = dimension.value_descriptions || [];
const listContainerHTML = `
const dimension = link["generalized_datacube:dimension"];
const value_descriptions = dimension.value_descriptions || [];
const listContainerHTML = `
<div class="item-list-container">
<label class="list-label">Select one or more values:</label>
<div class="scrollable-list">
${dimension.values
.map((value, index) => {
const labelText = value_descriptions[index] ? `${value} - ${value_descriptions[index]}` : value;
const labelText = value_descriptions[index]
? `${value} - ${value_descriptions[index]}`
: value;
return `
<div class="checkbox-container">
<label class="checkbox-label">
<input type="checkbox" class="item-checkbox" value="${value}" ${dimension.values.length === 1? 'checked' : ''}>
<input type="checkbox" class="item-checkbox" value="${value}" ${
dimension.values.length === 1 ? "checked" : ""
}>
${labelText}
</label>
</div>
@ -196,9 +206,10 @@ function renderCheckboxList(link) {
</div>
</div>
`;
return document.createRange().createContextualFragment(listContainerHTML).firstElementChild;
}
return document.createRange().createContextualFragment(listContainerHTML)
.firstElementChild;
}
// Render catalog items in the sidebar
function renderCatalogItems(links) {
@ -217,36 +228,37 @@ function renderCatalogItems(links) {
}
function renderRequestBreakdown(request, descriptions) {
const container = document.getElementById("request-breakdown");
const format_value = (key, value) => {
return `<span class="value" title="${descriptions[key]['value_descriptions'][value]}">"${value}"</span>`;
};
const format_values = (key, values) => {
if (values.length === 1) {
return format_value(key, values[0]);
}
return `[${values.map((v) =>
format_value(key, v)
).join(", ")}]`;
};
let html = `{\n` +
request
.map(
([key, values]) =>
` <span class="key" title="${descriptions[key]['description']}">"${key}"</span>: ${format_values(key, values)},`
)
.join("\n") +
`\n}`;
container.innerHTML = html;
}
const container = document.getElementById("request-breakdown");
const format_value = (key, value) => {
return `<span class="value" title="${descriptions[key]["value_descriptions"][value]}">"${value}"</span>`;
};
const format_values = (key, values) => {
if (values.length === 1) {
return format_value(key, values[0]);
}
return `[${values.map((v) => format_value(key, v)).join(", ")}]`;
};
let html =
`{\n` +
request
.map(
([key, values]) =>
` <span class="key" title="${
descriptions[key]["description"]
}">"${key}"</span>: ${format_values(key, values)},`
)
.join("\n") +
`\n}`;
container.innerHTML = html;
}
function renderRawSTACResponse(catalog) {
const itemDetails = document.getElementById("raw-stac");
// create new object without debug key
let just_stac = Object.assign({}, catalog);
delete just_stac.debug;
let just_stac = Object.assign({}, catalog);
delete just_stac.debug;
itemDetails.textContent = JSON.stringify(just_stac, null, 2);
const debug_container = document.getElementById("debug");