xarray-contrib · jemmajeffree · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/flox/_version.py b/flox/_version.py
@@ -0,0 +1 @@
+# NOTE(review): the ".dev…+g<hash>.d<date>" form looks like a tool-generated development version — confirm this file is meant to be committed.
+__version__ = "0.1.dev657+g619a390.d20250606"
diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py
@@ -5,6 +5,99 @@
 from . import xrdtypes as dtypes
 from .xrutils import is_scalar, isnull, notnull
 
+# Registry mapping a NumPy function to the MultiArray-aware implementation that
+# MultiArray.__array_function__ dispatches to; entries are added by @implements.
+MULTIARRAY_HANDLED_FUNCTIONS = {}
+
+
+class MultiArray:
+    """Bundle of equally shaped arrays that is passed around as one array-like object.
+
+    ``astype``, ``reshape``, ``squeeze`` and every NumPy function registered in
+    ``MULTIARRAY_HANDLED_FUNCTIONS`` are applied to each member array separately;
+    the per-member results are wrapped in a new ``MultiArray``.
+
+    Parameters
+    ----------
+    arrays : iterable of arrays
+        Member arrays. All of them must have the same ``shape``.
+
+    Raises
+    ------
+    ValueError
+        If the member arrays do not all share one shape.
+    """
+
+    arrays: tuple[np.ndarray, ...]
+
+    def __init__(self, arrays):
+        # Snapshot into a tuple so the attribute matches its annotation and so that
+        # later mutation of a caller-owned list cannot change this object.
+        arrays = tuple(arrays)
+        # Raise explicitly instead of asserting: asserts are stripped under ``python -O``.
+        if arrays and any(a.shape != arrays[0].shape for a in arrays[1:]):
+            raise ValueError("Expect all arrays to have the same shape")
+        self.arrays = arrays
+
+    def astype(self, dt, **kwargs):
+        """Return a new MultiArray with every member cast via ``member.astype(dt, **kwargs)``."""
+        return MultiArray([array.astype(dt, **kwargs) for array in self.arrays])
+
+    def reshape(self, shape, **kwargs):
+        """Return a new MultiArray with every member reshaped to ``shape``."""
+        return MultiArray([array.reshape(shape, **kwargs) for array in self.arrays])
+
+    def squeeze(self, axis=None):
+        """Return a new MultiArray with ``squeeze(axis)`` applied to every member."""
+        return MultiArray([array.squeeze(axis) for array in self.arrays])
+
+    def __array_function__(self, func, types, args, kwargs):
+        """NumPy ``__array_function__`` hook: forward ``func`` to its registered implementation.
+
+        Returns ``NotImplemented`` when ``func`` has no entry in
+        ``MULTIARRAY_HANDLED_FUNCTIONS``.
+        """
+        handler = MULTIARRAY_HANDLED_FUNCTIONS.get(func)
+        if handler is None:
+            return NotImplemented
+        # ``types`` is intentionally not inspected: no check is made that the other
+        # participating argument types are MultiArray subclasses.
+        return handler(*args, **kwargs)
+
+    # The properties below report the FIRST member only. ``shape`` and ``ndim`` are
+    # representative because __init__ enforces equal shapes; ``dtype`` is not
+    # validated, so other members may have a different dtype.
+    @property
+    def dtype(self) -> np.dtype:
+        """dtype of the first member array."""
+        return self.arrays[0].dtype
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        """Shape shared by all member arrays."""
+        return self.arrays[0].shape
+
+    @property
+    def ndim(self) -> int:
+        """Number of dimensions shared by all member arrays."""
+        return self.arrays[0].ndim
+
+
+def implements(numpy_function):
+    """Build a decorator that registers a MultiArray implementation of ``numpy_function``.
+
+    The decorated callable is stored in ``MULTIARRAY_HANDLED_FUNCTIONS`` under
+    ``numpy_function`` and handed back unchanged, so it remains directly callable.
+    """
+
+    def register(implementation):
+        MULTIARRAY_HANDLED_FUNCTIONS.update({numpy_function: implementation})
+        return implementation
+
+    return register
+
+
+@implements(np.expand_dims)
+def expand_dims_MultiArray(multiarray, axis):
+    """``np.expand_dims`` for MultiArray: insert the new axis into every member array."""
+    expanded = [np.expand_dims(member, axis) for member in multiarray.arrays]
+    return MultiArray(expanded)
+
+
+@implements(np.concatenate)
+def concatenate_MultiArray(multiarrays, axis=0):
+    """``np.concatenate`` for MultiArray: join the inputs member by member.
+
+    Member ``i`` of the result is the concatenation, along ``axis``, of member ``i``
+    of every input.
+
+    Parameters
+    ----------
+    multiarrays : sequence of MultiArray
+        Inputs to join; each must hold the same number of member arrays.
+    axis : int, default 0
+        Axis to concatenate along; the default mirrors ``np.concatenate``.
+
+    Raises
+    ------
+    NotImplementedError
+        If the inputs hold different numbers of member arrays.
+    """
+    n_arrays = len(multiarrays[0].arrays)
+    if any(len(ma.arrays) != n_arrays for ma in multiarrays[1:]):
+        raise NotImplementedError(
+            "Cannot concatenate MultiArrays holding different numbers of arrays"
+        )
+
+    # Iterate over member positions, i.e. the number of member arrays. Iterating over
+    # the number of *dimensions* only works when the two happen to be equal: it
+    # silently drops members when ndim is smaller and raises IndexError when larger.
+    return MultiArray(
+        [
+            np.concatenate([ma.arrays[i] for ma in multiarrays], axis)
+            for i in range(n_arrays)
+        ]
+    )
+
+
+@implements(np.transpose)
+def transpose_MultiArray(multiarray, axes=None):
+    """``np.transpose`` for MultiArray: permute the axes of every member array.
+
+    ``axes`` defaults to ``None`` exactly as in ``np.transpose``, so a plain
+    ``np.transpose(multiarray)`` dispatches here instead of failing on a missing argument.
+    """
+    return MultiArray([np.transpose(member, axes) for member in multiarray.arrays])
+
 
 def _prepare_for_flox(group_idx, array):
     """
@@ -251,6 +344,43 @@ def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None
     return out
 
 
+def var_chunk(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None):
+    """Compute the per-group intermediates from which a variance can be assembled.
+
+    Returns a ``MultiArray`` of three members, in this order: the per-group sum of
+    squared deviations from the group's own mean, the per-group sum, and the
+    per-group length.
+    """
+    # Every grouped reduction below is called with the same keyword arguments.
+    reduce_kwargs = dict(axis=axis, size=size, fill_value=fill_value, dtype=dtype)
+
+    # Lengths and sums feed the adjustment terms used when intermediates are merged.
+    # NOTE(review): lengths come from ``nanlen`` while totals come from ``sum``; if
+    # ``array`` may contain NaN, confirm whether a NaN-skipping sum is intended here.
+    group_lens = nanlen(group_idx, array, **reduce_kwargs)
+    group_sums = sum(group_idx, array, **reduce_kwargs)
+
+    # Each element is compared against the mean of its own group: indexing the
+    # per-group means with ``group_idx`` lines them up with ``array``.
+    group_means = group_sums / group_lens
+    deviations = array - group_means[..., group_idx]
+    sum_squared_deviations = sum(group_idx, deviations**2, **reduce_kwargs)
+
+    return MultiArray((sum_squared_deviations, group_sums, group_lens))
+
+
 def ffill(group_idx, array, *, axis, **kwargs):
     group_idx, array, perm = _prepare_for_flox(group_idx, array)
     shape = array.shape

diff --git a/flox/aggregations.py b/flox/aggregations.py
@@ -343,12 +343,61 @@ def _mean_finalize(sum_, count):
 )
 
 
+def _var_combine(array, axis, keepdims=True):
+    """Merge variance intermediates along ``axis`` into a single set.
+
+    Parameters
+    ----------
+    array : MultiArray
+        ``array.arrays`` is read as (sum of squared deviations, sum of items,
+        number of items), one entry per chunk along ``axis``.
+    axis : sequence of int
+        Must contain exactly one axis: the one being combined.
+    keepdims : bool, default True
+        Forwarded to the ``np.sum`` reductions.
+
+    Returns
+    -------
+    MultiArray
+        Same three members, reduced along ``axis``.
+
+    Raises
+    ------
+    ValueError
+        If ``axis`` does not contain exactly one entry.
+    """
+    if len(axis) != 1:
+        raise ValueError("Assuming that the combine function is only in one direction at once")
+    (combine_axis,) = axis
+
+    def _take(arr, selection):
+        """Index ``arr`` with ``selection`` along the combine axis, all other axes whole."""
+        index = [slice(None)] * arr.ndim
+        index[combine_axis] = selection
+        # tuple(index) rather than arr[*index]: star-unpacking inside a subscript
+        # is a SyntaxError before Python 3.11.
+        return arr[tuple(index)]
+
+    sum_deviations = array.arrays[0]
+    sum_X = array.arrays[1]
+    sum_len = array.arrays[2]
+
+    # Running totals: entry k describes everything merged up to and including chunk k.
+    cumsum_X = np.cumsum(sum_X, axis=combine_axis)
+    cumsum_len = np.cumsum(sum_len, axis=combine_axis)
+
+    # Pair "everything merged so far" (all but the last running total) with
+    # "the next chunk to merge" (all but the first chunk).
+    all_but_last = slice(None, -1)
+    all_but_first = slice(1, None)
+    merged_len = _take(cumsum_len, all_but_last)
+    merged_X = _take(cumsum_X, all_but_last)
+    next_len = _take(sum_len, all_but_first)
+    next_X = _take(sum_X, all_but_first)
+
+    # Adjustment to the sum of squared deviations because the two sides being merged
+    # do not share a mean.
+    numerator = (merged_len * next_X - next_len * merged_X) ** 2
+    denominator = merged_len * next_len * (merged_len + next_len)
+    # A side with zero items makes this 0/0. Merging with nothing must add no
+    # adjustment, so use 0 there instead of letting NaN poison the sum below.
+    # (Assumes the sum belonging to a zero-length side is itself 0 — TODO confirm.)
+    with np.errstate(invalid="ignore", divide="ignore"):
+        adj_terms = np.where(denominator != 0, numerator / denominator, 0)
+
+    return aggregate_flox.MultiArray(
+        (
+            np.sum(sum_deviations, axis=axis, keepdims=keepdims)
+            + np.sum(adj_terms, axis=axis, keepdims=keepdims),  # sum of squared deviations
+            np.sum(sum_X, axis=axis, keepdims=keepdims),  # sum of array items
+            np.sum(sum_len, axis=axis, keepdims=keepdims),  # sum of array lengths
+        )
+    )
+
+
 # TODO: fix this for complex numbers
-def _var_finalize(sumsq, sum_, count, ddof=0):
-    with np.errstate(invalid="ignore", divide="ignore"):
-        result = (sumsq - (sum_**2 / count)) / (count - ddof)
-    result[count <= ddof] = np.nan
-    return result
+# def _var_finalize(sumsq, sum_, count, ddof=0):
+# with np.errstate(invalid="ignore", divide="ignore"):
+# result = (sumsq - (sum_**2 / count)) / (count - ddof)
+# result[count <= ddof] = np.nan
+# return result
+
+
+def _var_finalize(multiarray, ddof=0):
+    """Turn combined variance intermediates into the variance itself.
+
+    Parameters
+    ----------
+    multiarray : MultiArray
+        ``arrays[0]`` is read as the sum of squared deviations and ``arrays[2]`` as
+        the number of contributing items; ``arrays[1]`` is not used here.
+    ddof : int, default 0
+        Delta degrees of freedom; the divisor is ``count - ddof``.
+
+    Returns
+    -------
+    ``sum_squared_deviations / (count - ddof)``, with NaN wherever ``count <= ddof``
+    (no positive divisor exists there).
+    """
+    # NOTE(review): the neighbouring ``_std_finalize`` still takes
+    # ``(sumsq, sum_, count, ddof)``; if it delegates to this function, it needs
+    # updating for the single-MultiArray signature.
+    sum_squared_deviations = multiarray.arrays[0]
+    count = multiarray.arrays[2]
+    # Zero or negative divisors are masked out below, so silence their warnings.
+    with np.errstate(invalid="ignore", divide="ignore"):
+        result = sum_squared_deviations / (count - ddof)
+    # np.where rather than item assignment so that 0-d / scalar inputs work too.
+    return np.where(count > ddof, result, np.nan)
 
 
 def _std_finalize(sumsq, sum_, count, ddof=0):
@@ -366,14 +415,24 @@ def _std_finalize(sumsq, sum_, count, ddof=0):
     dtypes=(None, None, np.intp),
     final_dtype=np.floating,
 )
+# nanvar = Aggregation(
+# "nanvar",
+# chunk=("nansum_of_squares", "nansum", "nanlen"),
+# combine=("sum", "sum", "sum"),
+# finalize=_var_finalize,
+# fill_value=0,
+# final_fill_value=np.nan,
+# dtypes=(None, None, np.intp),
+# final_dtype=np.floating,
+# )
 nanvar = Aggregation(
     "nanvar",
-    chunk=("nansum_of_squares", "nansum", "nanlen"),
-    combine=("sum", "sum", "sum"),
+    chunk=("var_chunk"),
+    combine=(_var_combine,),
     finalize=_var_finalize,
     fill_value=0,
     final_fill_value=np.nan,
-    dtypes=(None, None, np.intp),
+    dtypes=(None,),
     final_dtype=np.floating,
 )
 std = Aggregation(
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		__version__ = "0.1.dev657+g619a390.d20250606"
dcherian marked this conversation as resolved. Show resolved Hide resolved