time-to-botec/squiggle/node_modules/@stdlib/ndarray/base/unary/src/dispatch.c

/**
* @license Apache-2.0
*
* Copyright (c) 2021 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "stdlib/ndarray/base/unary/dispatch_object.h"
#include "stdlib/ndarray/base/unary/typedefs.h"
#include "stdlib/ndarray/base/iteration_order.h"
#include "stdlib/ndarray/base/bytes_per_element.h"
#include "stdlib/ndarray/ctor.h"
#include <stdint.h>
#include <stddef.h>

/**
* Applies a unary callback to an n-dimensional input ndarray having `ndims-1` singleton dimensions and assigns results to elements in an output ndarray having the same shape.
*
* ## Notes
*
* -   If able to successfully apply a unary callback, the function returns `0`; otherwise, the function returns an error code.
*
* @private
* @param f     unary ndarray function
* @param x1    input ndarray
* @param x2    output ndarray
* @param i     index of the non-singleton dimension
* @param fcn   callback
* @return      status code
*/
static int8_t stdlib_ndarray_unary_1d_squeeze( const ndarrayUnaryFcn f, struct ndarray *x1, struct ndarray *x2, const int64_t i, void *fcn ) {
	int64_t sh[] = { stdlib_ndarray_shape( x1 )[ i ] };

	// Shallow copy and reshape the arrays...
	int64_t sx1[] = { stdlib_ndarray_strides( x1 )[ i ] };
	struct ndarray *x1c = stdlib_ndarray_allocate(
		stdlib_ndarray_dtype( x1 ),
		stdlib_ndarray_data( x1 ),
		1,
		sh,
		sx1,
		stdlib_ndarray_offset( x1 ),
		stdlib_ndarray_order( x1 ),
		stdlib_ndarray_index_mode( x1 ),
		stdlib_ndarray_nsubmodes( x1 ),
		stdlib_ndarray_submodes( x1 )
	);
	if ( x1c == NULL ) {
		return -1;
	}
	int64_t sx2[] = { stdlib_ndarray_strides( x2 )[ i ] };
	struct ndarray *x2c = stdlib_ndarray_allocate(
		stdlib_ndarray_dtype( x2 ),
		stdlib_ndarray_data( x2 ),
		1,
		sh,
		sx2,
		stdlib_ndarray_offset( x2 ),
		stdlib_ndarray_order( x2 ),
		stdlib_ndarray_index_mode( x2 ),
		stdlib_ndarray_nsubmodes( x2 ),
		stdlib_ndarray_submodes( x2 )
	);
	if ( x2c == NULL ) {
		stdlib_ndarray_free( x1c );
		return -1;
	}
	// Apply the callback:
	struct ndarray *arrays[] = { x1c, x2c };
	int8_t status = f( arrays, fcn );

	// Free allocated memory:
	stdlib_ndarray_free( x1c );
	stdlib_ndarray_free( x2c );

	return status;
}

/**
* Applies a unary callback to a flattened n-dimensional input ndarray and assigns results to elements in a flattened n-dimensional output ndarray.
*
* ## Notes
*
* -   If able to successfully apply a unary callback, the function returns `0`; otherwise, the function returns an error code.
*
* @private
* @param f     unary ndarray function
* @param N     number of elements
* @param x1    input ndarray
* @param s1    input ndarray stride length
* @param x2    output ndarray
* @param s2    output ndarray stride length
* @param fcn   callback
* @return      status code
*/
static int8_t stdlib_ndarray_unary_1d_flatten( const ndarrayUnaryFcn f, const int64_t N, struct ndarray *x1, const int64_t s1, struct ndarray *x2, const int64_t s2, void *fcn ) {
	// Define the (flattened) strided array shape:
	int64_t sh[] = { N };

	// Shallow copy and reshape the arrays...
	int64_t sx1[] = { s2 };
	struct ndarray *x1c = stdlib_ndarray_allocate(
		stdlib_ndarray_dtype( x1 ),
		stdlib_ndarray_data( x1 ),
		1,
		sh,
		sx1,
		stdlib_ndarray_offset( x1 ),
		stdlib_ndarray_order( x1 ),
		stdlib_ndarray_index_mode( x1 ),
		stdlib_ndarray_nsubmodes( x1 ),
		stdlib_ndarray_submodes( x1 )
	);
	if ( x1c == NULL ) {
		return -1;
	}
	int64_t sx2[] = { s2 };
	struct ndarray *x2c = stdlib_ndarray_allocate(
		stdlib_ndarray_dtype( x2 ),
		stdlib_ndarray_data( x2 ),
		1,
		sh,
		sx2,
		stdlib_ndarray_offset( x2 ),
		stdlib_ndarray_order( x2 ),
		stdlib_ndarray_index_mode( x2 ),
		stdlib_ndarray_nsubmodes( x2 ),
		stdlib_ndarray_submodes( x2 )
	);
	if ( x2c == NULL ) {
		stdlib_ndarray_free( x1c );
		return -1;
	}
	// Apply the callback:
	struct ndarray *arrays[] = { x1c, x2c };
	int8_t status = f( arrays, fcn );

	// Free allocated memory:
	stdlib_ndarray_free( x1c );
	stdlib_ndarray_free( x2c );

	return status;
}

/**
* Dispatches to a unary ndarray function according to the dimensionality of provided ndarray arguments.
*
* ## Notes
*
* -   If able to successfully dispatch, the function returns `0`; otherwise, the function returns an error code.
*
* @param obj      object comprised of dispatch tables containing unary ndarray functions
* @param arrays   array whose first element is a pointer to an input ndarray and whose last element is a pointer to an output ndarray
* @param fcn      callback
* @return         status code
*
* @example
* #include "stdlib/ndarray/base/unary/dispatch.h"
* #include "stdlib/ndarray/base/unary/dispatch_object.h"
* #include "stdlib/ndarray/base/unary/typedefs.h"
* #include "stdlib/ndarray/base/unary/b_b.h"
* #include "stdlib/ndarray/ctor.h"
* #include <stdint.h>
* #include <stdlib.h>
* #include <stdio.h>
*
* // Define a list of unary ndarray functions:
* ndarrayUnaryFcn functions[] = {
*     stdlib_ndarray_b_b_0d,
*     stdlib_ndarray_b_b_1d,
*     stdlib_ndarray_b_b_2d,
*     stdlib_ndarray_b_b_3d,
*     stdlib_ndarray_b_b_4d,
*     stdlib_ndarray_b_b_5d,
*     stdlib_ndarray_b_b_6d,
*     stdlib_ndarray_b_b_7d,
*     stdlib_ndarray_b_b_8d,
*     stdlib_ndarray_b_b_9d,
*     stdlib_ndarray_b_b_10d
*     stdlib_ndarray_b_b_nd
* };
*
* // Define a list of unary ndarray functions using loop blocking...
* ndarrayUnaryFcn blocked_functions[] = {
*     stdlib_ndarray_b_b_2d_blocked,
*     stdlib_ndarray_b_b_3d_blocked,
*     stdlib_ndarray_b_b_4d_blocked,
*     stdlib_ndarray_b_b_5d_blocked,
*     stdlib_ndarray_b_b_6d_blocked,
*     stdlib_ndarray_b_b_7d_blocked,
*     stdlib_ndarray_b_b_8d_blocked,
*     stdlib_ndarray_b_b_9d_blocked,
*     stdlib_ndarray_b_b_10d_blocked
* };
*
* // Create a unary function dispatch object:
* struct ndarrayUnaryDispatchObject obj = {
*     // Array containing unary ndarray functions:
*     unary,
*
*     // Number of unary ndarray functions:
*     12,
*
*     // Array containing unary ndarray functions using loop blocking:
*     blocked_unary,
*
*     // Number of unary ndarray functions using loop blocking:
*     9
* }
*
* // ...
*
* // Create ndarrays...
* struct ndarray *x = stdlib_ndarray_allocate( ... );
* if ( x == NULL ) {
*     fprintf( stderr, "Error allocating memory.\n" );
*     exit( EXIT_FAILURE );
* }
*
* struct ndarray *y = stdlib_ndarray_allocate( ... );
* if ( y == NULL ) {
*     fprintf( stderr, "Error allocating memory.\n" );
*     exit( EXIT_FAILURE );
* }
*
* // ...
*
* // Define a callback:
* uint8_t scale( const uint8_t x ) {
*     return x + 10;
* }
*
* // Apply the callback:
* struct ndarray *arrays[] = { x, y };
* int8_t status = stdlib_ndarray_b_b( arrays, (void *)scale );
* if ( status != 0 ) {
*     fprintf( stderr, "Error during computation.\n" );
*     exit( EXIT_FAILURE );
* }
*/
int8_t stdlib_ndarray_unary_dispatch( const struct ndarrayUnaryDispatchObject *obj, struct ndarray *arrays[], void *fcn ) {
	struct ndarray *x1;
	struct ndarray *x2;
	int8_t status;
	int64_t ndims;
	int64_t mab1;
	int64_t mab2;
	int64_t mib1;
	int64_t mib2;
	int64_t *sh1;
	int64_t *sh2;
	int64_t *s1;
	int64_t *s2;
	int64_t len;
	int64_t bp1;
	int64_t bp2;
	int8_t io1;
	int8_t io2;
	int64_t ns;
	int64_t s;
	int64_t d;
	int64_t i;

	// Unpack the arrays:
	x1 = arrays[ 0 ];
	x2 = arrays[ 1 ];

	// Verify that the input and output arrays have the same number of dimensions...
	ndims = stdlib_ndarray_ndims( x1 );
	if ( ndims != stdlib_ndarray_ndims( x2 ) ) {
		return -1; // TODO: consider standardized error codes
	}
	// Determine whether we can avoid iteration altogether...
	if ( ndims == 0 ) {
		obj->functions[ 0 ]( arrays, fcn );
		return 0;
	}
	sh1 = stdlib_ndarray_shape( x1 );
	sh2 = stdlib_ndarray_shape( x2 );

	// Verify that the input and output arrays have the same dimensions...
	len = 1; // number of elements
	ns = 0;  // number of singleton dimensions
	for ( i = 0; i < ndims; i++ ) {
		d = sh1[ i ];
		if ( d != sh2[ i ] ) {
			return -1; // TODO: consider standardized error codes
		}
		// Note that, if one of the dimensions is `0`, the length will be `0`...
		len *= d;

		// Check whether the current dimension is a singleton dimension...
		if ( d == 1 ) {
			ns += 1;
		}
	}
	// Check whether we were provided empty ndarrays...
	if ( len == 0 ) {
		return 0;
	}
	// Determine whether the ndarrays are one-dimensional and thus readily translate to one-dimensional strided arrays...
	if ( ndims == 1 ) {
		obj->functions[ 1 ]( arrays, fcn );
		return 0;
	}
	// Determine whether the ndarrays have only **one** non-singleton dimension (e.g., ndims=4, shape=[10,1,1,1]) so that we can treat the ndarrays as being equivalent to one-dimensional strided arrays...
	if ( ns == ndims-1 ) {
		// Get the index of the non-singleton dimension...
		for ( i = 0; i < ndims; i++ ) {
			if ( sh1[ i ] != 1 ) {
				break;
			}
		}
		// Remove the singleton dimensions and apply the unary function...
		status = stdlib_ndarray_unary_1d_squeeze( obj->functions[ 1 ], x1, x2, i, fcn );
		if ( status == 0 ) {
			return 0;
		}
		// If we failed, this is probably due to failed memory allocation, so fall through and try again...
	}
	s1 = stdlib_ndarray_strides( x1 );
	s2 = stdlib_ndarray_strides( x2 );
	io1 = stdlib_ndarray_iteration_order( ndims, s1 ); // +/-1
	io2 = stdlib_ndarray_iteration_order( ndims, s2 ); // +/-1

	// Determine whether we can avoid blocked iteration...
	if ( io1 != 0 && io2 != 0 && stdlib_ndarray_order( x1 ) == stdlib_ndarray_order( x2 )) {
		// Determine the minimum and maximum linear byte indices which are accessible by the array views:
		mib1 = stdlib_ndarray_offset( x1 ); // byte offset
		mab1 = mib1;
		mib2 = stdlib_ndarray_offset( x2 ); // byte offset
		mab2 = mib2;
		for ( i = 0; i < ndims; i++ ) {
			s = s1[ i ]; // units: bytes
			if ( s > 0 ) {
				mab1 += s * ( sh1[i]-1 );
			} else if ( s < 0 ) {
				mib1 += s * ( sh1[i]-1 ); // decrements
			}
			s = s2[ i ];
			if ( s > 0 ) {
				mab2 += s * ( sh2[i]-1 );
			} else if ( s < 0 ) {
				mib2 += s * ( sh2[i]-1 ); // decrements
			}
		}
		bp1 = stdlib_ndarray_bytes_per_element( stdlib_ndarray_dtype( x1 ) );
		bp2 = stdlib_ndarray_bytes_per_element( stdlib_ndarray_dtype( x2 ) );

		// Determine whether we can ignore shape (and strides) and treat the ndarrays as linear one-dimensional strided arrays...
		if ( ( len*bp1 ) == ( mab1-mib1+bp1 ) && ( len*bp2 ) == ( mab2-mib2+bp2 ) ) {
			// Note: the above is equivalent to @stdlib/ndarray/base/assert/is-contiguous, but in-lined so we can retain computed values...
			status = stdlib_ndarray_unary_1d_flatten( obj->functions[ 1 ], len, x1, io1*bp1, x2, io2*bp2, fcn );
			if ( status == 0 ) {
				return 0;
			}
			// If we failed, this is probably due to failed memory allocation, so fall through and try again...
		}
		// At least one ndarray is non-contiguous, so we cannot directly use one-dimensional array functionality...

		// Determine whether we can use simple nested loops...
		if ( ndims < (obj->nfunctions) ) {
			// So long as iteration for each respective array always moves in the same direction (i.e., no mixed sign strides), we can leverage cache-optimal (i.e., normal) nested loops without resorting to blocked iteration...
			obj->functions[ ndims ]( arrays, fcn );
			return 0;
		}
		// Fall-through to blocked iteration...
	}
	// At this point, we're either dealing with non-contiguous n-dimensional arrays, high dimensional n-dimensional arrays, and/or arrays having differing memory layouts, so our only hope is that we can still perform blocked iteration...

	// Determine whether we can perform blocked iteration...
	if ( ndims <= (obj->nblockedfunctions)+1 ) {
		obj->blocked_functions[ ndims-2 ]( arrays, fcn );
		return 0;
	}
	// Fall-through to linear view iteration without regard for how data is stored in memory (i.e., take the slow path)...
	obj->functions[ (obj->nfunctions)-1 ]( arrays, fcn );

	return 0;
}