Update opus from 1.1.4 to 1.2.1

This commit is contained in:
Zack Middleton 2018-03-16 13:03:37 -05:00
parent c38c823a2a
commit cb24c59567
155 changed files with 6263 additions and 3968 deletions

View file

@ -164,11 +164,11 @@ while (<>) {
$prefix = "";
if ($proc)
{
$prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc) unless ($apple);
$prefix = $prefix.sprintf("\t.type\t%s, %%function", $proc) unless ($apple);
# Make sure we $prefix isn't empty here (for the $apple case).
# We handle mangling the label here, make sure it doesn't match
# the label handling below (if $prefix would be empty).
$prefix = "; ";
$prefix = $prefix."; ";
push(@proc_stack, $proc);
s/^[A-Za-z_\.]\w+/$symprefix$&:/;
}

View file

@ -35,12 +35,29 @@
#if defined(OPUS_HAVE_RTCD)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
celt_inner_prod_c, /* ARMv4 */
celt_inner_prod_c, /* EDSP */
celt_inner_prod_c, /* Media */
celt_inner_prod_neon /* NEON */
};
void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2) = {
dual_inner_prod_c, /* ARMv4 */
dual_inner_prod_c, /* EDSP */
dual_inner_prod_c, /* Media */
dual_inner_prod_neon /* NEON */
};
# endif
# if defined(FIXED_POINT)
# if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
(defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
(defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int , int) = {
const opus_val16 *, opus_val32 *, int, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
@ -51,7 +68,7 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
# else /* !FIXED_POINT */
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int) = {
const opus_val16 *, opus_val32 *, int, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
celt_pitch_xcorr_c, /* EDSP */
celt_pitch_xcorr_c, /* Media */

View file

@ -36,7 +36,6 @@
#endif
#endif
#include <NE10_init.h>
#include <NE10_dsp.h>
#include "os_support.h"
#include "kiss_fft.h"

View file

@ -191,107 +191,10 @@ static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
vst1q_f32(sum, SUMM);
}
/*
* Function: xcorr_kernel_neon_float_process1
* ---------------------------------
* Computes single correlation values and stores in *sum
*/
static void xcorr_kernel_neon_float_process1(const float32_t *x,
const float32_t *y, float32_t *sum, int len) {
float32x4_t XX[4];
float32x4_t YY[4];
float32x2_t XX_2;
float32x2_t YY_2;
float32x4_t SUMM;
float32x2_t SUMM_2[2];
const float32_t *xi = x;
const float32_t *yi = y;
SUMM = vdupq_n_f32(0);
/* Work on 16 values per iteration */
while (len >= 16) {
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
XX[2] = vld1q_f32(xi);
xi += 4;
XX[3] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
YY[2] = vld1q_f32(yi);
yi += 4;
YY[3] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
len -= 16;
}
/* Work on 8 values */
if (len >= 8) {
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
len -= 8;
}
/* Work on 4 values */
if (len >= 4) {
XX[0] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
len -= 4;
}
/* Start accumulating results */
SUMM_2[0] = vget_low_f32(SUMM);
if (len >= 2) {
/* While at it, consume 2 more values if available */
XX_2 = vld1_f32(xi);
xi += 2;
YY_2 = vld1_f32(yi);
yi += 2;
SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
len -= 2;
}
SUMM_2[1] = vget_high_f32(SUMM);
SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
/* Ok, now we have result accumulated in SUMM_2[0].0 */
if (len > 0) {
/* Case when you have one value left */
XX_2 = vld1_dup_f32(xi);
YY_2 = vld1_dup_f32(yi);
SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
}
vst1_lane_f32(sum, SUMM_2[0], 0);
}
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch) {
opus_val32 *xcorr, int len, int max_pitch, int arch) {
int i;
(void)arch;
celt_assert(max_pitch > 0);
celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
@ -300,12 +203,9 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
(float32_t *)xcorr+i, len);
}
/* In case max_pitch isn't multiple of 4
* compute single correlation value per iteration
*/
/* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
for (; i < max_pitch; i++) {
xcorr_kernel_neon_float_process1((const float32_t *)_x,
(const float32_t *)_y+i, (float32_t *)xcorr+i, len);
xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
}
}
#endif

View file

@ -44,7 +44,7 @@
.if OPUS_ARM_MAY_HAVE_NEON
@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
; xcorr_kernel_neon: @ PROC
.type xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
xcorr_kernel_neon_start:
@ input:
@ r3 = int len
@ -156,8 +156,8 @@ xcorr_kernel_neon_process1:
.size xcorr_kernel_neon, .-xcorr_kernel_neon @ ENDP
@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
@ opus_val32 *xcorr, int len, int max_pitch)
; celt_pitch_xcorr_neon: @ PROC
@ opus_val32 *xcorr, int len, int max_pitch, int arch)
.type celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
@ input:
@ r0 = opus_val16 *_x
@ r1 = opus_val16 *_y
@ -171,6 +171,8 @@ xcorr_kernel_neon_process1:
@ r6 = int max_pitch
@ r12 = int j
@ q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
@ ignored:
@ int arch
STMFD sp!, {r4-r6, lr}
LDR r6, [sp, #16]
VMOV.S32 q15, #1
@ -260,7 +262,7 @@ celt_pitch_xcorr_neon_done:
@ This will get used on ARMv7 devices without NEON, so it has been optimized
@ to take advantage of dual-issuing where possible.
; xcorr_kernel_edsp: @ PROC
.type xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
xcorr_kernel_edsp_start:
@ input:
@ r3 = int len
@ -344,7 +346,7 @@ xcorr_kernel_edsp_done:
LDMFD sp!, {r2,r4,r5,pc}
.size xcorr_kernel_edsp, .-xcorr_kernel_edsp @ ENDP
; celt_pitch_xcorr_edsp: @ PROC
.type celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
@ input:
@ r0 = opus_val16 *_x (must be 32-bit aligned)
@ r1 = opus_val16 *_y (only needs to be 16-bit aligned)
@ -361,6 +363,8 @@ xcorr_kernel_edsp_done:
@ r9 = opus_val32 sum3
@ r1 = int max_pitch
@ r12 = int j
@ ignored:
@ int arch
STMFD sp!, {r4-r11, lr}
MOV r5, r1
LDR r1, [sp, #36]

View file

@ -153,7 +153,7 @@ xcorr_kernel_neon_process1
ENDP
; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
; opus_val32 *xcorr, int len, int max_pitch)
; opus_val32 *xcorr, int len, int max_pitch, int arch)
celt_pitch_xcorr_neon PROC
; input:
; r0 = opus_val16 *_x
@ -168,6 +168,8 @@ celt_pitch_xcorr_neon PROC
; r6 = int max_pitch
; r12 = int j
; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
; ignored:
; int arch
STMFD sp!, {r4-r6, lr}
LDR r6, [sp, #16]
VMOV.S32 q15, #1
@ -358,6 +360,8 @@ celt_pitch_xcorr_edsp PROC
; r9 = opus_val32 sum3
; r1 = int max_pitch
; r12 = int j
; ignored:
; int arch
STMFD sp!, {r4-r11, lr}
MOV r5, r1
LDR r1, [sp, #36]

View file

@ -34,7 +34,6 @@
#if !defined(FFT_ARM_H)
#define FFT_ARM_H
#include "config.h"
#include "kiss_fft.h"
#if defined(HAVE_ARM_NE10)

View file

@ -37,7 +37,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
"#MULT16_32_Q16\n\t"
"smull %0, %1, %2, %3\n\t"
: "=&r"(rd_lo), "=&r"(rd_hi)
: "%r"(b),"r"(a<<16)
: "%r"(b),"r"(SHL32(a,16))
);
return rd_hi;
}
@ -54,10 +54,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
"#MULT16_32_Q15\n\t"
"smull %0, %1, %2, %3\n\t"
: "=&r"(rd_lo), "=&r"(rd_hi)
: "%r"(b), "r"(a<<16)
: "%r"(b), "r"(SHL32(a,16))
);
/*We intentionally don't OR in the high bit of rd_lo for speed.*/
return rd_hi<<1;
return SHL32(rd_hi,1);
}
#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))

View file

@ -59,7 +59,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
: "=r"(res)
: "r"(b), "r"(a)
);
return res<<1;
return SHL32(res,1);
}
#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
@ -76,7 +76,7 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
"#MAC16_32_Q15\n\t"
"smlawb %0, %1, %2, %3;\n"
: "=r"(res)
: "r"(b<<1), "r"(a), "r"(c)
: "r"(SHL32(b,1)), "r"(a), "r"(c)
);
return res;
}

View file

@ -33,7 +33,6 @@
#if !defined(MDCT_ARM_H)
#define MDCT_ARM_H
#include "config.h"
#include "mdct.h"
#if defined(HAVE_ARM_NE10)

View file

@ -30,11 +30,47 @@
# include "armcpu.h"
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
# define OVERRIDE_CELT_INNER_PROD (1)
# define OVERRIDE_DUAL_INNER_PROD (1)
# define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
# endif
# endif
# if !defined(OVERRIDE_CELT_INNER_PROD)
# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
# define OVERRIDE_CELT_INNER_PROD (1)
# define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_CELT_INNER_PROD (1)
# define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
# endif
# endif
# if !defined(OVERRIDE_DUAL_INNER_PROD)
# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
# define OVERRIDE_DUAL_INNER_PROD (1)
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_DUAL_INNER_PROD (1)
# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
# endif
# endif
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON)
opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
opus_val32 *xcorr, int len, int max_pitch, int arch);
# endif
# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
@ -43,7 +79,7 @@ opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
# if defined(OPUS_ARM_MAY_HAVE_EDSP)
opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
opus_val32 *xcorr, int len, int max_pitch, int arch);
# endif
# if defined(OPUS_HAVE_RTCD) && \
@ -52,18 +88,15 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
(defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
extern opus_val32
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int);
const opus_val16 *, opus_val32 *, int, int, int);
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
xcorr, len, max_pitch))
# define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
# elif defined(OPUS_ARM_PRESUME_EDSP) || \
defined(OPUS_ARM_PRESUME_MEDIA) || \
defined(OPUS_ARM_PRESUME_NEON)
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
# define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))
# endif
@ -99,25 +132,22 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
/* Float case */
#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
opus_val32 *xcorr, int len, int max_pitch, int arch);
#endif
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int);
const opus_val16 *, opus_val32 *, int, int, int);
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
xcorr, len, max_pitch))
# define celt_pitch_xcorr (*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
# define celt_pitch_xcorr celt_pitch_xcorr_float_neon
# endif

View file

@ -0,0 +1,290 @@
/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <arm_neon.h>
#include "pitch.h"
#ifdef FIXED_POINT
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
int i;
opus_val32 xy;
int16x8_t x_s16x8, y_s16x8;
int32x4_t xy_s32x4 = vdupq_n_s32(0);
int64x2_t xy_s64x2;
int64x1_t xy_s64x1;
for (i = 0; i < N - 7; i += 8) {
x_s16x8 = vld1q_s16(&x[i]);
y_s16x8 = vld1q_s16(&y[i]);
xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
}
if (N - i >= 4) {
const int16x4_t x_s16x4 = vld1_s16(&x[i]);
const int16x4_t y_s16x4 = vld1_s16(&y[i]);
xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
i += 4;
}
xy_s64x2 = vpaddlq_s32(xy_s32x4);
xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
for (; i < N; i++) {
xy = MAC16_16(xy, x[i], y[i]);
}
#ifdef OPUS_CHECK_ASM
celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif
return xy;
}
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
opus_val32 xy01, xy02;
int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
int32x4_t xy01_s32x4 = vdupq_n_s32(0);
int32x4_t xy02_s32x4 = vdupq_n_s32(0);
int64x2_t xy01_s64x2, xy02_s64x2;
int64x1_t xy01_s64x1, xy02_s64x1;
for (i = 0; i < N - 7; i += 8) {
x_s16x8 = vld1q_s16(&x[i]);
y01_s16x8 = vld1q_s16(&y01[i]);
y02_s16x8 = vld1q_s16(&y02[i]);
xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
}
if (N - i >= 4) {
const int16x4_t x_s16x4 = vld1_s16(&x[i]);
const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
i += 4;
}
xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
for (; i < N; i++) {
xy01 = MAC16_16(xy01, x[i], y01[i]);
xy02 = MAC16_16(xy02, x[i], y02[i]);
}
*xy1 = xy01;
*xy2 = xy02;
#ifdef OPUS_CHECK_ASM
{
opus_val32 xy1_c, xy2_c;
dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
celt_assert(xy1_c == *xy1);
celt_assert(xy2_c == *xy2);
}
#endif
}
#else /* !FIXED_POINT */
/* ========================================================================== */
#ifdef OPUS_CHECK_ASM
/* This part of code simulates floating-point NEON operations. */
/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of celt_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
{
int i;
opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
for (i = 0; i < N - 3; i += 4) {
xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
}
xy0 += xy2;
xy1 += xy3;
xy = xy0 + xy1;
for (; i < N; i++) {
xy = MAC16_16(xy, x[i], y[i]);
}
return xy;
}
/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of dual_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
for (i = 0; i < N - 3; i += 4) {
xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
}
xy01_0 += xy01_2;
xy02_0 += xy02_2;
xy01_1 += xy01_3;
xy02_1 += xy02_3;
xy01 = xy01_0 + xy01_1;
xy02 = xy02_0 + xy02_1;
for (; i < N; i++) {
xy01 = MAC16_16(xy01, x[i], y01[i]);
xy02 = MAC16_16(xy02, x[i], y02[i]);
}
*xy1 = xy01;
*xy2 = xy02;
}
#endif /* OPUS_CHECK_ASM */
/* ========================================================================== */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
int i;
opus_val32 xy;
float32x4_t xy_f32x4 = vdupq_n_f32(0);
float32x2_t xy_f32x2;
for (i = 0; i < N - 7; i += 8) {
float32x4_t x_f32x4, y_f32x4;
x_f32x4 = vld1q_f32(&x[i]);
y_f32x4 = vld1q_f32(&y[i]);
xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
x_f32x4 = vld1q_f32(&x[i + 4]);
y_f32x4 = vld1q_f32(&y[i + 4]);
xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
}
if (N - i >= 4) {
const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
i += 4;
}
xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
xy = vget_lane_f32(xy_f32x2, 0);
for (; i < N; i++) {
xy = MAC16_16(xy, x[i], y[i]);
}
#ifdef OPUS_CHECK_ASM
celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
#endif
return xy;
}
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
opus_val32 xy01, xy02;
float32x4_t xy01_f32x4 = vdupq_n_f32(0);
float32x4_t xy02_f32x4 = vdupq_n_f32(0);
float32x2_t xy01_f32x2, xy02_f32x2;
for (i = 0; i < N - 7; i += 8) {
float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
x_f32x4 = vld1q_f32(&x[i]);
y01_f32x4 = vld1q_f32(&y01[i]);
y02_f32x4 = vld1q_f32(&y02[i]);
xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
x_f32x4 = vld1q_f32(&x[i + 4]);
y01_f32x4 = vld1q_f32(&y01[i + 4]);
y02_f32x4 = vld1q_f32(&y02[i + 4]);
xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
}
if (N - i >= 4) {
const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
i += 4;
}
xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
xy01 = vget_lane_f32(xy01_f32x2, 0);
xy02 = vget_lane_f32(xy02_f32x2, 0);
for (; i < N; i++) {
xy01 = MAC16_16(xy01, x[i], y01[i]);
xy02 = MAC16_16(xy02, x[i], y02[i]);
}
*xy1 = xy01;
*xy2 = xy02;
#ifdef OPUS_CHECK_ASM
{
opus_val32 xy1_c, xy2_c;
dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
}
#endif
}
#endif /* FIXED_POINT */