[arm-gnu] vld4 / vst4 NEON intrinsics
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[arm-gnu] vld4 / vst4 NEON intrinsics
- To: arm-gnu@xxxxxxxxxxxxxxxx
- Subject: [arm-gnu] vld4 / vst4 NEON intrinsics
- From: Samuel Rødal <samuel.rodal@xxxxxxxxx>
- Date: Thu, 07 Jan 2010 18:32:28 +0100
The vldX and vstX variants of the NEON intrinsics, where X > 1, seem to
cause the compiler to generate an obscene amount of code.
Example:
/* Baseline case: copies one uint8x8_t (eight uint8_t lanes) from src to dst
   using the single-vector vld1_u8 / vst1_u8 intrinsics, with no processing
   in between. */
void blend1(uint8_t *src, uint8_t *dst)
{
/* Load one 8-lane vector from src. */
uint8x8_t temp = vld1_u8(src);
/* Store the same vector, unmodified, to dst. */
vst1_u8(dst, temp);
}
generates the sensible
@ Compiler output for blend1: the load reads through r0 (src) and the store
@ writes through r1 (dst); the vector stays in d16 the whole time.
vld1.8 {d16}, [r0]
vst1.8 {d16}, [r1]
@ Return to the caller.
bx lr
Whereas:
/* Problem case: same shape as blend1, but using the four-vector
   vld4_u8 / vst4_u8 intrinsics. The uint8x8x4_t value is loaded from src
   and stored straight back to dst with no processing in between. */
void blend4(uint8_t *src, uint8_t *dst)
{
/* Load a uint8x8x4_t (four uint8x8_t vectors) from src. */
uint8x8x4_t temp = vld4_u8(src);
/* Store the same four vectors, unmodified, to dst. */
vst4_u8(dst, temp);
}
generates
@ Compiler output for blend4. Between the vld4.8 and the vst4.8, the contents
@ of d16-d19 are spilled to the stack, copied three times from one stack slot
@ to another through r0-r3, and finally reloaded into d16-d19 unchanged.
@ Save r4-r6, which are used as scratch pointers below.
stmfd sp!, {r4, r5, r6}
.save {r4, r5, r6}
.LCFI4:
.pad #132
@ Reserve 132 bytes of stack for the temporary copies.
sub sp, sp, #132
.LCFI5:
@ The actual load: fills d16-d19 from [r0] (src).
vld4.8 {d16-d19}, [r0]
@ Spill d16-d19 (32 bytes) to the stack at sp+64.
add r6, sp, #64
vstmia r6, {d16-d19}
@ Keep dst in r5, because r1 is overwritten by the ldmia copies below.
mov r5, r1
@ Copy 1: sp+64..sp+95 -> sp+96..sp+127, 16 bytes at a time through r0-r3.
ldmia r6!, {r0, r1, r2, r3}
add ip, sp, #96
mov r4, ip
stmia r4!, {r0, r1, r2, r3}
ldmia r6, {r0, r1, r2, r3}
stmia r4, {r0, r1, r2, r3}
@ Copy 2: sp+96..sp+127 -> sp+32..sp+63 (r4 still points at sp+112 here).
ldmia ip!, {r0, r1, r2, r3}
add ip, sp, #32
mov r6, ip
stmia r6!, {r0, r1, r2, r3}
ldmia r4, {r0, r1, r2, r3}
stmia r6, {r0, r1, r2, r3}
@ Copy 3: sp+32..sp+63 -> sp+0..sp+31 (r6 still points at sp+48 here).
ldmia ip!, {r0, r1, r2, r3}
mov r4, sp
stmia r4!, {r0, r1, r2, r3}
ldmia r6, {r0, r1, r2, r3}
stmia r4, {r0, r1, r2, r3}
@ Reload d16-d19 from the final copy at sp; the data is unchanged.
vldmia sp, {d16-d19}
@ The actual store, through the dst pointer saved in r5.
vst4.8 {d16-d19}, [r5]
@ Release the scratch area, restore r4-r6 and return.
add sp, sp, #132
ldmfd sp!, {r4, r5, r6}
bx lr
Compile flags used were "-mfloat-abi=softfp -mfpu=neon -O3". Any ideas
or workarounds?
Regards,
Samuel