// license:BSD-3-Clause
// copyright-holders:Vas Crabb
/***************************************************************************

    eigccx86.h

    x86 (32 and 64-bit) inline implementations for GCC compilers. This
    code is automatically included if appropriate by eminline.h.

***************************************************************************/

#ifndef __EIGCCX86__
#define __EIGCCX86__

/* Include MMX/SSE intrinsics headers */

#ifdef __SSE2__
#include <stdlib.h>
#include <mmintrin.h>   /* MMX */
#include <xmmintrin.h>  /* SSE */
#include <emmintrin.h>  /* SSE2 */
#endif


/***************************************************************************
    INLINE MATH FUNCTIONS
***************************************************************************/

/* Overlay giving access to the two 32-bit words of a 64-bit value */
union _x86_union
{
	uint64_t u64;       /* the complete 64-bit quantity */
	struct
	{
		uint32_t l;     /* first word in memory: low half on little-endian x86 */
		uint32_t h;     /* second word in memory: high half on little-endian x86 */
	} u32;
};

/*-------------------------------------------------
    mul_32x32 - signed 32 bit x 32 bit multiply
    returning the complete 64 bit product
    (only provided for 32-bit x86 builds)
-------------------------------------------------*/

#ifndef __x86_64__
#define mul_32x32 _mul_32x32
static inline int64_t ATTR_CONST ATTR_FORCE_INLINE
_mul_32x32(int32_t a, int32_t b)
{
	int64_t product;

	/* one-operand IMUL: edx:eax = eax * operand */
	__asm__ (
		" imull  %[rhs] ;"
		: [prod] "=A"  (product)    /* 64-bit product is delivered in edx:eax */
		: [lhs]  "%a"  (a)          /* first factor preloaded in eax; '%' marks it commutative with 'b' */
		, [rhs]  "rm"  (b)          /* second factor taken from a register or memory */
		: "cc"                      /* the multiply modifies the flags */
	);

	return product;
}
#endif


/*-------------------------------------------------
    mulu_32x32 - unsigned 32 bit x 32 bit multiply
    returning the complete 64 bit product
    (only provided for 32-bit x86 builds)
-------------------------------------------------*/

#ifndef __x86_64__
#define mulu_32x32 _mulu_32x32
static inline uint64_t ATTR_CONST ATTR_FORCE_INLINE
_mulu_32x32(uint32_t a, uint32_t b)
{
	uint64_t product;

	/* MUL: edx:eax = eax * operand */
	__asm__ (
		" mull  %[rhs] ;"
		: [prod] "=A"  (product)    /* 64-bit product is delivered in edx:eax */
		: [lhs]  "%a"  (a)          /* first factor preloaded in eax; '%' marks it commutative with 'b' */
		, [rhs]  "rm"  (b)          /* second factor taken from a register or memory */
		: "cc"                      /* the multiply modifies the flags */
	);

	return product;
}
#endif


/*-------------------------------------------------
    mul_32x32_hi - signed 32 bit x 32 bit multiply
    returning only the upper 32 bits of the 64 bit
    product
-------------------------------------------------*/

#define mul_32x32_hi _mul_32x32_hi
static inline int32_t ATTR_CONST ATTR_FORCE_INLINE
_mul_32x32_hi(int32_t a, int32_t b)
{
	int32_t upper;      /* high half of the product (edx) */
	int32_t discard;    /* low half of the product (eax), not used */

	/* one-operand IMUL: edx:eax = eax * operand */
	__asm__ (
		" imull  %[rhs] ;"
		: [hi]  "=d"  (upper)       /* upper 32 bits are left in edx */
		, [lo]  "=a"  (discard)     /* eax is overwritten; declared as an output so GCC knows */
		: [lhs] "a"   (a)           /* first factor must be in eax */
		, [rhs] "rm"  (b)           /* second factor from a register or memory */
		: "cc"                      /* the multiply modifies the flags */
	);

	return upper;
}


/*-------------------------------------------------
    mulu_32x32_hi - unsigned 32 bit x 32 bit
    multiply returning only the upper 32 bits of
    the 64 bit product
-------------------------------------------------*/

#define mulu_32x32_hi _mulu_32x32_hi
static inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
_mulu_32x32_hi(uint32_t a, uint32_t b)
{
	uint32_t upper;     /* high half of the product (edx) */
	uint32_t discard;   /* low half of the product (eax), not used */

	/* MUL: edx:eax = eax * operand */
	__asm__ (
		" mull  %[rhs] ;"
		: [hi]  "=d"  (upper)       /* upper 32 bits are left in edx */
		, [lo]  "=a"  (discard)     /* eax is overwritten; declared as an output so GCC knows */
		: [lhs] "a"   (a)           /* first factor must be in eax */
		, [rhs] "rm"  (b)           /* second factor from a register or memory */
		: "cc"                      /* the multiply modifies the flags */
	);

	return upper;
}


/*-------------------------------------------------
    mul_32x32_shift - signed 32 bit x 32 bit
    multiply; the 64 bit product is shifted right
    by 'shift' bits and the low 32 bits of that
    are returned (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define mul_32x32_shift _mul_32x32_shift
static inline int32_t ATTR_CONST ATTR_FORCE_INLINE
_mul_32x32_shift(int32_t a, int32_t b, uint8_t shift)
{
	int32_t shifted;

	/* Only meaningful for shift counts 0..31 */
	/* IMUL leaves the product in edx:eax; SHRD then shifts eax right, */
	/* pulling the vacated high bits in from edx */
	__asm__ (
		" imull  %[rhs]                     ;"
		" shrdl  %[count], %%edx, %[out]    ;"
		: [out]   "=a" (shifted)    /* final value is produced in eax */
		: [lhs]   "%0" (a)          /* first factor shares eax with the output */
		, [rhs]   "rm" (b)          /* second factor from a register or memory */
		, [count] "Ic" (shift)      /* shift count: immediate 0-31, otherwise cl */
		: "%edx", "cc"              /* edx and the flags are overwritten */
	);

	return shifted;
}
#endif


/*-------------------------------------------------
    mulu_32x32_shift - unsigned 32 bit x 32 bit
    multiply; the 64 bit product is shifted right
    by 'shift' bits and the low 32 bits of that
    are returned (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define mulu_32x32_shift _mulu_32x32_shift
static inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
_mulu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
{
	uint32_t shifted;

	/* Only meaningful for shift counts 0..31 */
	/* MUL leaves the product in edx:eax; SHRD then shifts eax right, */
	/* pulling the vacated high bits in from edx */
	__asm__ (
		" mull   %[rhs]                     ;"
		" shrdl  %[count], %%edx, %[out]    ;"
		: [out]   "=a" (shifted)    /* final value is produced in eax */
		: [lhs]   "%0" (a)          /* first factor shares eax with the output */
		, [rhs]   "rm" (b)          /* second factor from a register or memory */
		, [count] "Ic" (shift)      /* shift count: immediate 0-31, otherwise cl */
		: "%edx", "cc"              /* edx and the flags are overwritten */
	);

	return shifted;
}
#endif


/*-------------------------------------------------
    div_64x32 - signed divide of a 64 bit value by
    a 32 bit value, returning the 32 bit quotient
    (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define div_64x32 _div_64x32
static inline int32_t ATTR_CONST ATTR_FORCE_INLINE
_div_64x32(int64_t a, int32_t b)
{
	int32_t quotient;
	int32_t discard;    /* receives the remainder, which is not returned */

	/* The CPU raises an arithmetic exception when the quotient cannot be held in 32 bits */
	__asm__ (
		" idivl  %[den] ;"
		: [quo] "=a" (quotient)     /* IDIV leaves the quotient in eax */
		, [rem] "=d" (discard)      /* edx is overwritten; declared as an output so GCC knows */
		: [num] "A"  (a)            /* 64-bit dividend supplied in edx:eax */
		, [den] "rm" (b)            /* divisor from a register or memory */
		: "cc"                      /* the divide modifies the flags */
	);

	return quotient;
}
#endif


/*-------------------------------------------------
    divu_64x32 - unsigned divide of a 64 bit value
    by a 32 bit value, returning the 32 bit
    quotient (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define divu_64x32 _divu_64x32
static inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
_divu_64x32(uint64_t a, uint32_t b)
{
	uint32_t quotient;
	uint32_t discard;   /* receives the remainder, which is not returned */

	/* The CPU raises an arithmetic exception when the quotient cannot be held in 32 bits */
	__asm__ (
		" divl  %[den] ;"
		: [quo] "=a" (quotient)     /* DIV leaves the quotient in eax */
		, [rem] "=d" (discard)      /* edx is overwritten; declared as an output so GCC knows */
		: [num] "A"  (a)            /* 64-bit dividend supplied in edx:eax */
		, [den] "rm" (b)            /* divisor from a register or memory */
		: "cc"                      /* the divide modifies the flags */
	);

	return quotient;
}
#endif


/*-------------------------------------------------
    div_64x32_rem - signed divide of a 64 bit value
    by a 32 bit value; returns the 32 bit quotient
    and stores the 32 bit remainder through
    'remainder' (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define div_64x32_rem _div_64x32_rem
static inline int32_t ATTR_FORCE_INLINE
_div_64x32_rem(int64_t dividend, int32_t divisor, int32_t *remainder)
{
	int32_t quot;

	/* The CPU raises an arithmetic exception when the quotient cannot be held in 32 bits */
	__asm__ (
		" idivl  %[den] ;"
		: [quo] "=a" (quot)         /* IDIV leaves the quotient in eax */
		, [rem] "=d" (*remainder)   /* ...and the remainder in edx, written to the caller's slot */
		: [num] "A"  (dividend)     /* 64-bit dividend supplied in edx:eax */
		, [den] "rm" (divisor)      /* divisor from a register or memory */
		: "cc"                      /* the divide modifies the flags */
	);

	return quot;
}
#endif


/*-------------------------------------------------
    divu_64x32_rem - unsigned divide of a 64 bit
    value by a 32 bit value; returns the 32 bit
    quotient and stores the 32 bit remainder
    through 'remainder'

    Both variants use the 32-bit DIV instruction,
    so the CPU raises an arithmetic exception when
    the quotient cannot be held in 32 bits.
-------------------------------------------------*/

#ifndef __x86_64__
#define divu_64x32_rem _divu_64x32_rem
static inline uint32_t ATTR_FORCE_INLINE
_divu_64x32_rem(uint64_t dividend, uint32_t divisor, uint32_t *remainder)
{
	uint32_t quot;

	/* 32-bit build: the "A" constraint places the dividend in edx:eax */
	__asm__ (
		" divl  %[den] ;"
		: [quo] "=a" (quot)         /* DIV leaves the quotient in eax */
		, [rem] "=d" (*remainder)   /* ...and the remainder in edx, written to the caller's slot */
		: [num] "A"  (dividend)     /* 64-bit dividend supplied in edx:eax */
		, [den] "rm" (divisor)      /* divisor from a register or memory */
		: "cc"                      /* the divide modifies the flags */
	);

	return quot;
}
#else
#define divu_64x32_rem _divu_64x32_rem
static inline uint32_t ATTR_FORCE_INLINE
_divu_64x32_rem(uint64_t dividend, uint32_t divisor, uint32_t *remainder)
{
	/* 64-bit build: hand the two halves of the dividend to eax and edx */
	/* explicitly; plain integer arithmetic keeps this valid in C and C++ */
	const uint32_t lo = (uint32_t)dividend;
	const uint32_t hi = (uint32_t)(dividend >> 32);
	uint32_t quot;

	__asm__ (
		" divl  %[den] ;"
		: [quo]  "=a" (quot)        /* DIV leaves the quotient in eax */
		, [rem]  "=d" (*remainder)  /* ...and the remainder in edx, written to the caller's slot */
		: [numl] "a"  (lo)          /* low 32 bits of the dividend in eax */
		, [numh] "d"  (hi)          /* high 32 bits of the dividend in edx */
		, [den]  "rm" (divisor)     /* divisor from a register or memory */
		: "cc"                      /* the divide modifies the flags */
	);

	return quot;
}
#endif


/*-------------------------------------------------
    div_32x32_shift - signed divide: 'a' is sign
    extended to 64 bits and shifted left by 'shift'
    bits, then divided by 'b'; the 32 bit quotient
    is returned (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define div_32x32_shift _div_32x32_shift
static inline int32_t ATTR_CONST ATTR_FORCE_INLINE
_div_32x32_shift(int32_t a, int32_t b, uint8_t shift)
{
	int32_t quotient;

	/* Only meaningful for shift counts 0..31 */
	/* The CPU raises an arithmetic exception when the quotient cannot be held in 32 bits */
	/* CDQ sign-extends eax into edx; SHLD + SHL together shift edx:eax left; */
	/* IDIV then divides edx:eax by the operand */
	__asm__ (
		" cdq                             ;"
		" shldl  %[count], %[num], %%edx  ;"
		" shll   %[count], %[num]         ;"
		" idivl  %[den]                   ;"
		: [quo]   "=&a" (quotient)  /* quotient is produced in eax (early clobber) */
		: [num]   "0"   (a)         /* 'a' shares eax with the output */
		, [den]   "rm"  (b)         /* divisor from a register or memory */
		, [count] "Ic"  (shift)     /* shift count: immediate 0-31, otherwise cl */
		: "%edx", "cc"              /* edx and the flags are overwritten */
	);

	return quotient;
}
#endif


/*-------------------------------------------------
    divu_32x32_shift - unsigned divide: 'a' is zero
    extended to 64 bits and shifted left by 'shift'
    bits, then divided by 'b'; the 32 bit quotient
    is returned (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define divu_32x32_shift _divu_32x32_shift
static inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
_divu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
{
	uint32_t result;    /* unsigned, matching the return type */

	/* Valid for (0 <= shift <= 31) */
	/* Throws arithmetic exception if result doesn't fit in 32 bits */
	/* edx is zeroed with XOR ('clr' is not an x86 mnemonic); SHLD + SHL */
	/* then shift edx:eax left and DIV divides edx:eax by the operand */
	__asm__ (
		" xorl   %%edx, %%edx          ;"
		" shldl  %[shift], %[a], %%edx ;"
		" shll   %[shift], %[a]        ;"
		" divl   %[b]                  ;"
		: [result] "=&a" (result)   /* result ends up in eax */
		: [a]      "0"   (a)        /* 'a' should also be in eax on entry */
		, [b]      "rm"  (b)        /* 'b' can be memory or register */
		, [shift]  "Ic"  (shift)    /* 'shift' must be constant in 0-31 range or in cl */
		: "%edx", "cc"              /* clobbers edx and condition codes */
	);

	return result;
}
#endif


/*-------------------------------------------------
    mod_64x32 - signed divide of a 64 bit value by
    a 32 bit value, returning the 32 bit remainder
    (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define mod_64x32 _mod_64x32
static inline int32_t ATTR_CONST ATTR_FORCE_INLINE
_mod_64x32(int64_t a, int32_t b)
{
	int32_t rem;
	int32_t discard;    /* receives the quotient, which is not returned */

	/* The CPU raises an arithmetic exception when the quotient cannot be held in 32 bits */
	__asm__ (
		" idivl  %[den] ;"
		: [rem] "=d" (rem)          /* IDIV leaves the remainder in edx */
		, [quo] "=a" (discard)      /* eax is overwritten; declared as an output so GCC knows */
		: [num] "A"  (a)            /* 64-bit dividend supplied in edx:eax */
		, [den] "rm" (b)            /* divisor from a register or memory */
		: "cc"                      /* the divide modifies the flags */
	);

	return rem;
}
#endif


/*-------------------------------------------------
    modu_64x32 - unsigned divide of a 64 bit value
    by a 32 bit value, returning the 32 bit
    remainder (32-bit x86 builds only)
-------------------------------------------------*/

#ifndef __x86_64__
#define modu_64x32 _modu_64x32
static inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
_modu_64x32(uint64_t a, uint32_t b)
{
	uint32_t rem;
	uint32_t discard;   /* receives the quotient, which is not returned */

	/* The CPU raises an arithmetic exception when the quotient cannot be held in 32 bits */
	__asm__ (
		" divl  %[den] ;"
		: [rem] "=d" (rem)          /* DIV leaves the remainder in edx */
		, [quo] "=a" (discard)      /* eax is overwritten; declared as an output so GCC knows */
		: [num] "A"  (a)            /* 64-bit dividend supplied in edx:eax */
		, [den] "rm" (b)            /* divisor from a register or memory */
		: "cc"                      /* the divide modifies the flags */
	);

	return rem;
}
#endif


/*-------------------------------------------------
    recip_approx - approximate single precision
    floating point reciprocal, computed with the
    SSE scalar reciprocal intrinsic
-------------------------------------------------*/

#ifdef __SSE2__
#define recip_approx _recip_approx
static inline float ATTR_CONST
_recip_approx(float value)
{
	float approx;

	/* load into the low lane, take the scalar reciprocal estimate, store the low lane back */
	_mm_store_ss(&approx, _mm_rcp_ss(_mm_set_ss(value)));

	return approx;
}
#endif



/***************************************************************************
    INLINE BIT MANIPULATION FUNCTIONS
***************************************************************************/

/*-------------------------------------------------
    count_leading_zeros - number of zero bits
    above the most significant set bit of a 32-bit
    value (32 when the value is zero)
-------------------------------------------------*/

#define count_leading_zeros _count_leading_zeros
static inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
_count_leading_zeros(uint32_t value)
{
	uint32_t count;

	/* BSR gives the index of the highest set bit and sets ZF for a zero source. */
	/* For a non-zero source, index ^ 31 == 31 - index; for zero, 63 ^ 31 == 32. */
	__asm__ (
		"   bsrl  %[src], %[cnt]  ;"
		"   jnz   1f              ;"
		"   movl  $63, %[cnt]     ;"
		"1: xorl  $31, %[cnt]     ;"
		: [cnt] "=r" (count)        /* any general register will do for the result */
		: [src] "rm" (value)        /* source from a register or memory */
		: "cc"                      /* flags are overwritten */
	);

	return count;
}


/*-------------------------------------------------
    count_leading_ones - number of one bits above
    the most significant clear bit of a 32-bit
    value (32 when every bit is set)
-------------------------------------------------*/

#define count_leading_ones _count_leading_ones
static inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
_count_leading_ones(uint32_t value)
{
	uint32_t count;

	/* Invert a copy of the value, then count its leading zeros: */
	/* BSR gives the index of the highest set bit and sets ZF for a zero source. */
	/* For a non-zero source, index ^ 31 == 31 - index; for zero, 63 ^ 31 == 32. */
	__asm__ (
		"   movl  %[src], %[cnt]  ;"
		"   notl  %[cnt]          ;"
		"   bsrl  %[cnt], %[cnt]  ;"
		"   jnz   1f              ;"
		"   movl  $63, %[cnt]     ;"
		"1: xorl  $31, %[cnt]     ;"
		: [cnt] "=r"  (count)       /* any general register will do for the result */
		: [src] "rmi" (value)       /* source from a register, memory or an immediate */
		: "cc"                      /* flags are overwritten */
	);

	return count;
}

#endif /* __EIGCCX86__ */
