/* Copyright 2014 The ChromiumOS Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/* Common math functions. */

#include "common.h"
#include "math.h"
#include "math_util.h"
#include "util.h"

/* For cosine lookup table, define the increment and the size of the table. */
#define COSINE_LUT_INCR_DEG	5
#define COSINE_LUT_SIZE		((180 / COSINE_LUT_INCR_DEG) + 1)

/* Lookup table for the value of cosine from 0 degrees to 180 degrees. */
static const fp_t cos_lut[] = {
	FLOAT_TO_FP( 1.00000), FLOAT_TO_FP( 0.99619), FLOAT_TO_FP( 0.98481),
	FLOAT_TO_FP( 0.96593), FLOAT_TO_FP( 0.93969), FLOAT_TO_FP( 0.90631),
	FLOAT_TO_FP( 0.86603), FLOAT_TO_FP( 0.81915), FLOAT_TO_FP( 0.76604),
	FLOAT_TO_FP( 0.70711), FLOAT_TO_FP( 0.64279), FLOAT_TO_FP( 0.57358),
	FLOAT_TO_FP( 0.50000), FLOAT_TO_FP( 0.42262), FLOAT_TO_FP( 0.34202),
	FLOAT_TO_FP( 0.25882), FLOAT_TO_FP( 0.17365), FLOAT_TO_FP( 0.08716),
	FLOAT_TO_FP( 0.00000), FLOAT_TO_FP(-0.08716), FLOAT_TO_FP(-0.17365),
	FLOAT_TO_FP(-0.25882), FLOAT_TO_FP(-0.34202), FLOAT_TO_FP(-0.42262),
	FLOAT_TO_FP(-0.50000), FLOAT_TO_FP(-0.57358), FLOAT_TO_FP(-0.64279),
	FLOAT_TO_FP(-0.70711), FLOAT_TO_FP(-0.76604), FLOAT_TO_FP(-0.81915),
	FLOAT_TO_FP(-0.86603), FLOAT_TO_FP(-0.90631), FLOAT_TO_FP(-0.93969),
	FLOAT_TO_FP(-0.96593), FLOAT_TO_FP(-0.98481), FLOAT_TO_FP(-0.99619),
	FLOAT_TO_FP(-1.00000),
};
BUILD_ASSERT(ARRAY_SIZE(cos_lut) == COSINE_LUT_SIZE);


fp_t arc_cos(fp_t x)
{
	int i;

	/* Cap x if out of range. */
	if (x < FLOAT_TO_FP(-1.0))
		x = FLOAT_TO_FP(-1.0);
	else if (x > FLOAT_TO_FP(1.0))
		x = FLOAT_TO_FP(1.0);

	/*
	 * Increment through lookup table to find index and then linearly
	 * interpolate for precision.
	 */
	/* TODO(crosbug.com/p/25600): Optimize with binary search. */
	for (i = 0; i < COSINE_LUT_SIZE-1; i++) {
		if (x >= cos_lut[i+1]) {
			const fp_t interp = fp_div(cos_lut[i] - x,
						   cos_lut[i] - cos_lut[i + 1]);

			return fp_mul(INT_TO_FP(COSINE_LUT_INCR_DEG),
				      INT_TO_FP(i) + interp);
		}
	}

	/*
	 * Shouldn't be possible to get here because inputs are clipped to
	 * [-1, 1] and the cos_lut[] table goes over the same range. If we
	 * are here, throw an assert.
	 */
	ASSERT(0);

	return 0;
}

/**
 * Integer square root.
 */
#ifdef CONFIG_FPU
/*
 * Use library sqrtf instruction, if available, since it's usually much faster
 * and smaller. On Cortex-M4, this becomes a single instruction which takes
 * 14 cycles to execute. This produces identical results to binary search,
 * except when the floating point representation of the square root rounds up
 * to an integer.
 */
inline int int_sqrtf(fp_inter_t x)
{
	return sqrtf(x);
}

/* If the platform support FPU, just return sqrtf. */
fp_t fp_sqrtf(fp_t x)
{
	return sqrtf(x);
}
#else
int int_sqrtf(fp_inter_t x)
{
	int rmax = INT32_MAX;
	int rmin = 0;

	/*
	 * Short cut if x is 32-bit value
	 * sqrt(INT32_MAX) ~= 46340.95
	 */
	if (x < INT32_MAX)
		rmax = 46341;

	/*
	 * Just binary-search.  There are better algorithms, but we call this
	 * infrequently enough it doesn't matter.
	 */
	if (x <= 0)
		return 0;  /* Yeah, for imaginary numbers too */
	else if (x >= (fp_inter_t)rmax * rmax)
		return rmax;

	while (1) {
		int r = rmin + (rmax - rmin) / 2;
		fp_inter_t r2 = (fp_inter_t)r * r;

		if (r2 > x) {
			/* Guessed too high */
			rmax = r;
		} else if (r2 < x) {
			/* Guessed too low.  Watch out for rounding! */
			if (rmin == r)
				return r;

			rmin = r;
		} else {
			/* Bullseye! */
			return r;
		}
	}
}

fp_t fp_sqrtf(fp_t x)
{
	fp_inter_t preshift_x = (fp_inter_t)x << FP_BITS;

	return int_sqrtf(preshift_x);
}
#endif /* CONFIG_FPU */

int vector_magnitude(const intv3_t v)
{
	fp_inter_t sum =   (fp_inter_t)v[0] * v[0] +
			(fp_inter_t)v[1] * v[1] +
			(fp_inter_t)v[2] * v[2];

	return int_sqrtf(sum);
}

/* cross_product only works if the vectors magnitudes are around 1<<16. */
void cross_product(const intv3_t v1, const intv3_t v2, intv3_t v)
{
	v[X] = (fp_inter_t)v1[Y] * v2[Z] - (fp_inter_t)v1[Z] * v2[Y];
	v[Y] = (fp_inter_t)v1[Z] * v2[X] - (fp_inter_t)v1[X] * v2[Z];
	v[Z] = (fp_inter_t)v1[X] * v2[Y] - (fp_inter_t)v1[Y] * v2[X];
}

fp_inter_t dot_product(const intv3_t v1, const intv3_t v2)
{
	return (fp_inter_t)v1[X] * v2[X] +
	       (fp_inter_t)v1[Y] * v2[Y] +
	       (fp_inter_t)v1[Z] * v2[Z];
}

void vector_scale(intv3_t v, fp_t s)
{
	v[X] = fp_mul(v[X], s);
	v[Y] = fp_mul(v[Y], s);
	v[Z] = fp_mul(v[Z], s);
}

fp_t cosine_of_angle_diff(const intv3_t v1, const intv3_t v2)
{
	fp_inter_t dotproduct;
	fp_inter_t denominator;

	/*
	 * Angle between two vectors is acos(A dot B / |A|*|B|). To return
	 * cosine of angle between vectors, then don't do acos operation.
	 */
	dotproduct = dot_product(v1, v2);

	denominator = (fp_inter_t)vector_magnitude(v1) * vector_magnitude(v2);

	/* Check for divide by 0 although extremely unlikely. */
	if (!denominator)
		return 0;

	/*
	 * We must shift the dot product first, so that we can represent
	 * fractions.  The answer is always a number with magnitude < 1.0, so
	 * if we don't shift, it will always round down to 0.
	 *
	 * Note that overflow is possible if the dot product is large (that is,
	 * if the vector components are of size (31 - FP_BITS/2) bits.  If that
	 * ever becomes a problem, we could detect this by counting the leading
	 * zeroes of the dot product and shifting the denominator down
	 * partially instead of shifting the dot product up.  With the current
	 * FP_BITS=16, that happens if the vector components are ~2^23.  Which
	 * we're a long way away from; the vector components used in
	 * accelerometer calculations are ~2^11.
	 */
	return fp_div(dotproduct, denominator);
}

/*
 * rotate a vector v
 *  - support input v and output res are the same vector
 */
void rotate(const intv3_t v, const mat33_fp_t R, intv3_t res)
{
	fp_inter_t t[3];

	if (R == NULL) {
		if (v != res)
			memcpy(res, v, sizeof(intv3_t));
		return;
	}


	/* Rotate */
	t[0] =	(fp_inter_t)v[0] * R[0][0] +
		(fp_inter_t)v[1] * R[1][0] +
		(fp_inter_t)v[2] * R[2][0];
	t[1] =	(fp_inter_t)v[0] * R[0][1] +
		(fp_inter_t)v[1] * R[1][1] +
		(fp_inter_t)v[2] * R[2][1];
	t[2] =	(fp_inter_t)v[0] * R[0][2] +
		(fp_inter_t)v[1] * R[1][2] +
		(fp_inter_t)v[2] * R[2][2];

	/* Scale by fixed point shift when writing back to result */
	res[0] = FP_TO_INT(t[0]);
	res[1] = FP_TO_INT(t[1]);
	res[2] = FP_TO_INT(t[2]);
}

void rotate_inv(const intv3_t v, const mat33_fp_t R, intv3_t res)
{
	fp_inter_t t[3];
	fp_t deter;

	if (R == NULL) {
		if (v != res)
			memcpy(res, v, sizeof(intv3_t));
		return;
	}

	deter = fp_mul(R[0][0], (fp_mul(R[1][1], R[2][2]) -
				 fp_mul(R[2][1], R[1][2]))) -
		fp_mul(R[0][1], (fp_mul(R[1][0], R[2][2]) -
				 fp_mul(R[1][2], R[2][0]))) +
		fp_mul(R[0][2], (fp_mul(R[1][0], R[2][1]) -
				 fp_mul(R[1][1], R[2][0])));

	/*
	 * invert the matrix: from
	 * http://stackoverflow.com/questions/983999/
	 * simple-3x3-matrix-inverse-code-c
	 */
	t[0] =  (fp_inter_t)v[0] * (fp_mul(R[1][1], R[2][2]) -
				 fp_mul(R[2][1], R[1][2])) -
		(fp_inter_t)v[1] * (fp_mul(R[1][0], R[2][2]) -
				 fp_mul(R[1][2], R[2][0])) +
		(fp_inter_t)v[2] * (fp_mul(R[1][0], R[2][1]) -
				 fp_mul(R[2][0], R[1][1]));

	t[1] =	(fp_inter_t)v[0] * (fp_mul(R[0][1], R[2][2]) -
				 fp_mul(R[0][2], R[2][1])) * -1 +
		(fp_inter_t)v[1] * (fp_mul(R[0][0], R[2][2]) -
				 fp_mul(R[0][2], R[2][0])) -
		(fp_inter_t)v[2] * (fp_mul(R[0][0], R[2][1]) -
				 fp_mul(R[2][0], R[0][1]));

	t[2] =	(fp_inter_t)v[0] * (fp_mul(R[0][1], R[1][2]) -
				 fp_mul(R[0][2], R[1][1])) -
		(fp_inter_t)v[1] * (fp_mul(R[0][0], R[1][2]) -
				 fp_mul(R[1][0], R[0][2])) +
		(fp_inter_t)v[2] * (fp_mul(R[0][0], R[1][1]) -
				 fp_mul(R[1][0], R[0][1]));

	/* Scale by fixed point shift when writing back to result */
	res[0] = FP_TO_INT(fp_div(t[0], deter));
	res[1] = FP_TO_INT(fp_div(t[1], deter));
	res[2] = FP_TO_INT(fp_div(t[2], deter));
}

/* division that round to the nearest integer */
int round_divide(int64_t dividend, int divisor)
{
	return (dividend > 0) ^ (divisor > 0) ?
		(dividend - divisor / 2) / divisor :
		(dividend + divisor / 2) / divisor;
}

#if ULONG_MAX == 0xFFFFFFFFUL
/*
 * 32 bit processor
 *
 * Some 32 bit processors do not include a 64 bit shift
 * operation. So fall back to 32 bit operations on a
 * union.
 */
uint64_t bitmask_uint64(int offset)
{
	union mask64_t {
		struct {
#if (__BYTE_ORDER__  == __ORDER_LITTLE_ENDIAN__)
			uint32_t lo;
			uint32_t hi;
#elif (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
			uint32_t hi;
			uint32_t lo;
#endif
		};
		uint64_t val;
	} mask;

	/* Clear out the mask */
	mask.val = 0;

	/*
	 * If the shift is out of range the result should
	 * remain 0, otherwise perform the shift
	 */
	if (offset >= 0 && offset < 64) {
		if (offset < 32)
			mask.lo = BIT(offset);
		else
			mask.hi = BIT(offset - 32);
	}
	return mask.val;
}
#endif