/* dspfns.h
 *
 * Copyright 2001 ARM Limited. All rights reserved.
 *
 * RCS $Revision: 136973 $
 * Checkin $Date: 2008-08-20 09:29:55 +0100 (Wed, 20 Aug 2008) $
 * Revising $Author: agrant $
 */

/* ----------------------------------------------------------------------
 * This header file provides a set of DSP-type primitive
 * operations, such as 16-bit and 32-bit saturating arithmetic. The
 * operations it provides are similar to the ones used by the ITU
 * for publishing specifications of DSP algorithms.
 */

#ifndef ARMDSP_DSPFNS_H
#define ARMDSP_DSPFNS_H

#ifdef __cplusplus
#define __STDC_LIMIT_MACROS 1
#endif /* __cplusplus */
#include <stdint.h>
#include <assert.h>

#ifndef MAX_16
#define MAX_16 INT16_MAX
#define MIN_16 INT16_MIN
#define MAX_32 INT32_MAX
#define MIN_32 INT32_MIN
#endif /* MAX_16 etc. */

#ifndef __TARGET_FEATURE_DSPMUL
#error ETSI intrinsics not currently emulated on this platform
#endif /* __TARGET_FEATURE_DSPMUL */

#if defined(__thumb) && (__TARGET_ARCH_THUMB < 4)
#error ETSI intrinsics not available on Thumb-1
#endif /* Thumb but not Thumb-2 */

#pragma recognize_itu_functions /* enable vectorization of ITU functions */

#ifndef __BIG_ENDIAN

typedef union {
  struct {
    int _dnm:27;
    int Q:1;
    int V:1;
    int C:1;
    int Z:1;
    int N:1;
  } b;
  unsigned int word;
} _ARM_PSR;

#else /* __BIG_ENDIAN */

typedef union {
  struct {
    int N:1;
    int Z:1;
    int C:1;
    int V:1;
    int Q:1;
    int _dnm:27;
  } b;
  unsigned int word;
} _ARM_PSR;

#endif /* __BIG_ENDIAN */

register _ARM_PSR _apsr_for_q __asm("apsr");
#define Overflow _apsr_for_q.b.Q

#ifdef __cplusplus
#define __ARM_INTRINSIC __forceinline
#elif defined __GNUC__ || defined _USE_STATIC_INLINE
#define __ARM_INTRINSIC static __forceinline
#elif (defined(__STDC_VERSION__) && 199901L <= __STDC_VERSION__)
#define __ARM_INTRINSIC __forceinline
#else
#define __ARM_INTRINSIC __forceinline
#endif

#ifndef __ARM_DSP_IGNORE_OVERFLOW
#define __ARM_DSP_IGNORE_OVERFLOW 0
#endif

__ARM_INTRINSIC int *_arm_global_carry(void) {
    static int c;
    return &c;
}

#define Carry (*_arm_global_carry())

#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif

/*
 * Convert a 32-bit signed integer into a 16-bit signed integer by
 * saturation.
 */
__ARM_INTRINSIC int16_t saturate(int32_t x)
{
#if (defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6)
    return (int16_t)__ssat(x, 16);
#else
    /* ARM v5E has no SSAT instruction */
    if (x > INT16_MAX)
        Overflow = 1, x = INT16_MAX;
    else if (x < INT16_MIN)
        Overflow = 1, x = INT16_MIN;
    return (int16_t) x;
#endif
}

/*
 * Add two 16-bit signed integers with saturation.
 */
__ARM_INTRINSIC int16_t add(int16_t x, int16_t y)
{
#if __ARM_DSP_IGNORE_OVERFLOW && ((defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6))
    return (int16_t)__qadd16(x, y);
#else
    return (int16_t)(__qadd(x<<16, y<<16) >> 16);
#endif
}

/*
 * Subtract one 16-bit signed integer from another with saturation.
 */
__ARM_INTRINSIC int16_t sub(int16_t x, int16_t y)
{
#if __ARM_DSP_IGNORE_OVERFLOW && ((defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6))
    return (int16_t)__qsub16(x, y);
#else
    return (int16_t)(__qsub(x<<16, y<<16) >> 16);
#endif
}

/*
 * Absolute value of a 16-bit signed integer. Saturating, so
 * abs(-0x8000) becomes +0x7FFF.
 */
__ARM_INTRINSIC int16_t abs_s(int16_t x)
{
    if (x >= 0)
        return x;
#if ((defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6))
    return (int16_t)__qsub16(0, x);
#else
    else if (x == INT16_MIN)
        return INT16_MAX;
    else
        return (int16_t) -x;
#endif
}

/*
 * Shift a 16-bit signed integer left (or right, if the shift count
 * is negative). Saturate on overflow.
 */
__ARM_INTRINSIC int16_t shl(int16_t x, int16_t shift)
{
    if (shift <= 0 || x == 0)
        return (int16_t) (x >> (-shift));
    if (shift > 15)
        shift = 16;
    return saturate(x << shift);
}

/*
 * Shift a 16-bit signed integer right (or left, if the shift count
 * is negative). Saturate on overflow.
 */
__ARM_INTRINSIC int16_t shr(int16_t x, int16_t shift)
{
    if (shift >= 0 || x == 0)
        return (int16_t) (x >> shift);
    if (shift < -15)
        shift = -16;
    return saturate(x << (-shift));
}

/*
 * Multiply two 16-bit signed integers, shift the result right by
 * 15 and saturate it. (Saturation is only necessary if both inputs
 * were -0x8000, in which case the result "should" be 0x8000 and is
 * saturated to 0x7FFF.)
 */
__ARM_INTRINSIC int16_t mult(int16_t x, int16_t y)
{
    return (int16_t)(__qdbl(x*y) >> 16);
}

/*
 * Multiply two 16-bit signed integers to give a 32-bit signed
 * integer. Shift left by one, and saturate the result. (Saturation
 * is only necessary if both inputs were -0x8000, in which case the
 * result "should" be 0x40000000 << 1 = +0x80000000, and is
 * saturated to +0x7FFFFFFF.)
 */
__ARM_INTRINSIC int32_t L_mult(int16_t x, int16_t y)
{
    return __qdbl(x*y);
}

/*
 * Negate a 16-bit signed integer, with saturation. (Saturation is
 * only necessary when the input is -0x8000.)
 */
__ARM_INTRINSIC int16_t negate(int16_t x)
{
#if (defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6)
    return (int16_t)__qsub16(0, x);
#else
    if (x == INT16_MIN)
        return INT16_MAX;
    return (int16_t) -x;
#endif
}

/*
 * Return the top 16 bits of a 32-bit signed integer.
 */
__ARM_INTRINSIC int16_t extract_h(int32_t x)
{
    return (int16_t) (x >> 16);
}

/*
 * Return the bottom 16 bits of a 32-bit signed integer, with no
 * saturation, just coerced into a two's complement 16 bit
 * representation.
 */
__ARM_INTRINSIC int16_t extract_l(int32_t x)
{
    return (int16_t) x;
}

/*
 * Divide a 32-bit signed integer by 2^16, rounding to the nearest
 * integer (round up on a tie). Equivalent to adding 0x8000 with
 * saturation, then shifting right by 16.
 */
__ARM_INTRINSIC int16_t round(int32_t x)
{
    return extract_h(__qadd(x, 0x8000));
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and add to
 * another 32-bit integer with saturation.
 * 
 * Note the intermediate saturation operation in the definition:
 * 
 *    L_mac(-1, -0x8000, -0x8000)
 * 
 * will give 0x7FFFFFFE and not 0x7FFFFFFF:
 *    the unshifted product is:   0x40000000
 *    shift left with saturation: 0x7FFFFFFF
 *    add to -1 with saturation:  0x7FFFFFFE
 */
__ARM_INTRINSIC int32_t L_mac(int32_t accumulator, int16_t x, int16_t y)
{
    return __qadd(accumulator, __qdbl(x*y));
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and subtract
 * from another 32-bit integer with saturation.
 * 
 * Note the intermediate saturation operation in the definition:
 * 
 *    L_msu(1, -0x8000, -0x8000)
 * 
 * will give 0x80000002 and not 0x80000001:
 *    the unshifted product is:         0x40000000
 *    shift left with saturation:       0x7FFFFFFF
 *    subtract from 1 with saturation:  0x80000002
 */
__ARM_INTRINSIC int32_t L_msu(int32_t accumulator, int16_t x, int16_t y)
{
    return __qsub(accumulator, __qdbl(x*y));
}

/*
 * Add two 32-bit signed integers with saturation.
 */
__ARM_INTRINSIC int32_t L_add(int32_t x, int32_t y)
{
    return __qadd(x, y);
}

/*
 * Subtract one 32-bit signed integer from another with saturation.
 */
__ARM_INTRINSIC int32_t L_sub(int32_t x, int32_t y)
{
    return __qsub(x, y);
}

/*
 * Add together the Carry variable and two 32-bit signed integers,
 * without saturation.
 */
__ARM_INTRINSIC int32_t L_add_c(int32_t x, int32_t y)
{
    int32_t result, flags;

    __asm {
        movs flags, Carry, lsr #1
        adcs result, x, y;
        mrs flags, CPSR;
    }
    if (flags & 0x10000000) Overflow = 1;  /* V -> Q */
    Carry = (flags & 0x20000000) != 0;
    return result;
}

/*
 * Subtract one 32-bit signed integer, together with the logical
 * negation of the Carry variable, from another 32-bit signed integer,
 * without saturation.
 * N.b. the computation matches that of the ETSI reference function
 * (in basicop2.c).  The comment above the ETSI reference function says
 * that L_sub_c(a,b) = a-b-C, but that does not match their code.
 */
__ARM_INTRINSIC int32_t L_sub_c(int32_t x, int32_t y)
{
    int32_t result, flags;

    __asm {
        movs flags, Carry, lsr #1
        sbcs result, x, y;
        mrs flags, CPSR;
    }
    if (flags & 0x10000000) Overflow = 1;  /* V -> Q */
    Carry = (flags & 0x20000000) != 0;
    return result;
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one _with_ saturation, and add
 * with carry to another 32-bit integer _without_ saturation.
 */
__ARM_INTRINSIC int32_t L_macNs(int32_t accumulator, int16_t x, int16_t y)
{
    return L_add_c(accumulator, L_mult(x, y));
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one _with_ saturation, and
 * subtract with carry from another 32-bit integer _without_
 * saturation.
 */
__ARM_INTRINSIC int32_t L_msuNs(int32_t accumulator, int16_t x, int16_t y)
{
    return L_sub_c(accumulator, L_mult(x, y));
}

/*
 * Negate a 32-bit signed integer, with saturation. (Saturation is
 * only necessary when the input is -0x80000000.)
 */
__ARM_INTRINSIC int32_t L_negate(int32_t x)
{
    return __qsub(0, x);
}

/*
 * Multiply two 16-bit signed integers, shift the result right by
 * 15 with rounding, and saturate it. (Saturation is only necessary
 * if both inputs were -0x8000, in which case the result "should"
 * be 0x8000 and is saturated to 0x7FFF.)
 */
__ARM_INTRINSIC int16_t mult_r(int16_t x, int16_t y)
{
    return (int16_t)(__qadd(0x8000, __qdbl(x*y)) >> 16);
}

/*
 * Return the number of bits of left shift needed to arrange for a
 * 16-bit signed integer to have value >= 0x4000 or <= -0x4001.
 * 
 * Returns 0 if x is zero (following C reference implementation).
 */
__ARM_INTRINSIC int16_t norm_s(int16_t x)
{
    return __clz(x ^ ((int32_t)x << 17)) & 15;
}

/*
 * Return the number of bits of left shift needed to arrange for a
 * 32-bit signed integer to have value >= 0x40000000 (if +ve)
 * or <= -0x40000001 (if -ve).
 * 
 * Returns 0 if x is zero (following C reference implementation).
 */
__ARM_INTRINSIC int16_t norm_l(int32_t x)
{
    return __clz(x ^ (x << 1)) & 31;
}

/*
 * Shift a 32-bit signed integer left (or right, if the shift count
 * is negative). Saturate on overflow.
 */
__ARM_INTRINSIC int32_t L_shl(int32_t x, int16_t shift)
{
    if (shift <= 0)
        return x >> (-shift);
    if (shift <= norm_l(x) || x == 0)
        return x << shift;
    if (x < 0)
        return INT32_MIN;
    else
        return INT32_MAX;
}

/*
 * Shift a 32-bit signed integer right (or left, if the shift count
 * is negative). Saturate on overflow.
 */
__ARM_INTRINSIC int32_t L_shr(int32_t x, int16_t shift)
{
    if (shift >= 0)
        return x >> shift;
    if ((-shift) <= norm_l(x) || x == 0)
        return x << (-shift);
    if (x < 0)
        return INT32_MIN;
    else
        return INT32_MAX;
}

/*
 * Shift a 16-bit signed integer right, with rounding. Shift left
 * with saturation if the shift count is negative.
 */
__ARM_INTRINSIC int16_t shr_r(int16_t x, int16_t shift)
{
    if (shift == 0 || x == 0)
        return (int16_t)x;
    if (shift > 0)
        return (int16_t) (((x >> (shift-1)) + 1) >> 1);
    if (shift < -15)
        shift = -16;
    return saturate(x << (-shift));
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and add to
 * another 32-bit integer with saturation (like L_mac). Then shift
 * the result right by 15 bits with rounding (like round).
 */
__ARM_INTRINSIC int16_t mac_r(int32_t accumulator, int16_t x, int16_t y)
{
    return round(L_mac(accumulator, x, y));
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and subtract
 * from another 32-bit integer with saturation (like L_msu). Then
 * shift the result right by 15 bits with rounding (like round).
 */
__ARM_INTRINSIC int16_t msu_r(int32_t accumulator, int16_t x, int16_t y)
{
    return round(L_msu(accumulator, x, y));
}

/*
 * Shift a 16-bit signed integer left by 16 bits to generate a
 * 32-bit signed integer. The bottom 16 bits are zeroed.
 */
__ARM_INTRINSIC int32_t L_deposit_h(int16_t x)
{
    return ((int32_t)x) << 16;
}

/*
 * Sign-extend a 16-bit signed integer by 16 bits to generate a
 * 32-bit signed integer.
 */
__ARM_INTRINSIC int32_t L_deposit_l(int16_t x)
{
    return (int32_t)x;
}

/*
 * Shift a 32-bit signed integer right, with rounding. Shift left
 * with saturation if the shift count is negative.
 */
__ARM_INTRINSIC int32_t L_shr_r(int32_t x, int16_t shift)
{
    if (shift == 0 || x == 0)
        return x;
    if (shift > 0) {
        int32_t x2 = x >> (shift-1);
        return (x2 >> 1) + (x2 & 1);
    }
    if (-shift <= norm_l(x) || x == 0)
        return x << (-shift);
    if (x < 0)
        return INT32_MIN;
    else
        return INT32_MAX;
}

/*
 * Absolute value of a 32-bit signed integer. Saturating, so
 * abs(-0x80000000) becomes +0x7FFFFFFF.
 */
__ARM_INTRINSIC int32_t L_abs(int32_t x)
{
    if (x >= 0)
        return x;
    else
        return __qsub(0, x);
}

/*
 * Return a saturated value appropriate to the most recent carry-
 * affecting operation (L_add_c, L_macNs, L_sub_c, L_msuNs).
 * 
 * In other words: return the argument if the Q flag is clear.
 * Otherwise, return -0x80000000 or +0x7FFFFFFF depending on
 * whether the Carry flag is set or clear (respectively).
 */
__ARM_INTRINSIC int32_t L_sat(int32_t x)
{
    if (!Overflow)
        return x;
    if (Carry)
        return INT32_MIN;
    else
        return INT32_MAX;
}

/*
 * Divide one 16-bit signed integer by another, and produce a
 * 15-bit fixed point fractional result (by multiplying the true
 * mathematical result by 0x8000). The divisor (denominator) is
 * assumed to be non-zero and also assumed to be greater or equal
 * to the dividend (numerator). Hence the (unscaled) result is
 * necessarily within the range [0,1].
 * 
 * Both operands are assumed to be positive.
 * 
 * After division, the result is saturated to fit into a 16-bit
 * signed integer. (The only way this can happen is if the operands
 * are equal, so that the result would be 1, i.e. +0x8000 in 15-bit
 * fixed point.)
 */
__ARM_INTRINSIC int16_t div_s(int16_t x, int16_t y)
{
    int32_t quot;
    assert(y > 0);
    assert(x >= 0);
    assert(x <= y);
    quot = 0x8000 * x;
    quot /= y;
    if (quot > INT16_MAX)
        return INT16_MAX;
    else
        return (int16_t)quot;
}

#ifdef __cplusplus
}
#endif

#endif /* ARMDSP_DSPFNS_H */