/* Author's note: I wrote this but am not sure it is correct -- my PSP is out of battery right now, so it has not been tested. */
/*
 * vfpuadd16 -- statement macro, used as `vfpuadd16;`.
 *
 * Adds the sixteen floats of blockAdapt32 to the sixteen floats of blockM32,
 * element by element, storing the sums back into blockM32. Both names are
 * taken from the scope in which the macro is expanded; blockAdapt32 is only
 * read.
 *
 * PSP build: four lv.q/vadd.q/sv.q groups, each handling one quad (4 floats).
 *   - lv.q / sv.q transfer 16 bytes, so &blockM32[0] and &blockAdapt32[0]
 *     must be 16-byte aligned.
 *   - Each operand names only the first float of its quad, so the "memory"
 *     clobber tells the compiler that the neighbouring floats are accessed too.
 *   - VFPU registers R100-R103 and R000-R003 are overwritten and are not
 *     declared to the compiler.
 *
 * Other targets: a plain C loop performing the same sixteen single-precision
 * additions, so the surrounding code can be built and tested off-device.
 *
 * NOTE(review): the PSP path is selected by __psp__ / _PSP, which the PSP
 * toolchain is expected to predefine -- confirm for your compiler. If neither
 * is defined the C loop is used instead.
 */
#if defined(__psp__) || defined(_PSP)
#define vfpuadd16 \
    __asm__ volatile( \
        "lv.q   R100, %0\n" \
        "lv.q   R101, %1\n" \
        "lv.q   R102, %2\n" \
        "lv.q   R103, %3\n" \
        "lv.q   R000, %4\n" \
        "lv.q   R001, %5\n" \
        "lv.q   R002, %6\n" \
        "lv.q   R003, %7\n" \
        "vadd.q R100, R100, R000\n" \
        "vadd.q R101, R101, R001\n" \
        "vadd.q R102, R102, R002\n" \
        "vadd.q R103, R103, R003\n" \
        "sv.q   R100, %0\n" \
        "sv.q   R101, %1\n" \
        "sv.q   R102, %2\n" \
        "sv.q   R103, %3\n" \
        : "+m" (blockM32[4*0]), \
          "+m" (blockM32[4*1]), \
          "+m" (blockM32[4*2]), \
          "+m" (blockM32[4*3]) \
        : "m" (blockAdapt32[4*0]), \
          "m" (blockAdapt32[4*1]), \
          "m" (blockAdapt32[4*2]), \
          "m" (blockAdapt32[4*3]) \
        : "memory")
#else
#define vfpuadd16 \
    do { \
        for (int vfpu_lane_ = 0; vfpu_lane_ < 16; vfpu_lane_++) \
            blockM32[vfpu_lane_] += blockAdapt32[vfpu_lane_]; \
    } while (0)
#endif
/*
 * vfpusub16 -- statement macro, used as `vfpusub16;`.
 *
 * Subtracts the sixteen floats of blockAdapt32 from the sixteen floats of
 * blockM32, element by element, storing the differences back into blockM32.
 * Both names are taken from the scope in which the macro is expanded;
 * blockAdapt32 is only read.
 *
 * PSP build: four lv.q/vsub.q/sv.q groups, each handling one quad (4 floats).
 *   - lv.q / sv.q transfer 16 bytes, so &blockM32[0] and &blockAdapt32[0]
 *     must be 16-byte aligned.
 *   - Each operand names only the first float of its quad, so the "memory"
 *     clobber tells the compiler that the neighbouring floats are accessed too.
 *   - VFPU registers R100-R103 and R000-R003 are overwritten and are not
 *     declared to the compiler.
 *
 * Other targets: a plain C loop performing the same sixteen single-precision
 * subtractions, so the surrounding code can be built and tested off-device.
 *
 * NOTE(review): the PSP path is selected by __psp__ / _PSP, which the PSP
 * toolchain is expected to predefine -- confirm for your compiler. If neither
 * is defined the C loop is used instead.
 */
#if defined(__psp__) || defined(_PSP)
#define vfpusub16 \
    __asm__ volatile( \
        "lv.q   R100, %0\n" \
        "lv.q   R101, %1\n" \
        "lv.q   R102, %2\n" \
        "lv.q   R103, %3\n" \
        "lv.q   R000, %4\n" \
        "lv.q   R001, %5\n" \
        "lv.q   R002, %6\n" \
        "lv.q   R003, %7\n" \
        "vsub.q R100, R100, R000\n" \
        "vsub.q R101, R101, R001\n" \
        "vsub.q R102, R102, R002\n" \
        "vsub.q R103, R103, R003\n" \
        "sv.q   R100, %0\n" \
        "sv.q   R101, %1\n" \
        "sv.q   R102, %2\n" \
        "sv.q   R103, %3\n" \
        : "+m" (blockM32[4*0]), \
          "+m" (blockM32[4*1]), \
          "+m" (blockM32[4*2]), \
          "+m" (blockM32[4*3]) \
        : "m" (blockAdapt32[4*0]), \
          "m" (blockAdapt32[4*1]), \
          "m" (blockAdapt32[4*2]), \
          "m" (blockAdapt32[4*3]) \
        : "memory")
#else
#define vfpusub16 \
    do { \
        for (int vfpu_lane_ = 0; vfpu_lane_ < 16; vfpu_lane_++) \
            blockM32[vfpu_lane_] -= blockAdapt32[vfpu_lane_]; \
    } while (0)
#endif
/*
 * Adds sixteen 16-bit values from pAdapt to the sixteen values at pM, in place.
 *
 * The values are widened to float, summed by the vfpuadd16 macro (which
 * operates on the local arrays blockM32 / blockAdapt32 by name), and
 * narrowed back into pM.
 *
 * pM     - sixteen shorts, read and overwritten with pM[i] + pAdapt[i]
 * pAdapt - sixteen shorts, read only
 */
static inline void AdaptVFPUAdd(short * pM, const short * pAdapt) {
    /*
     * vfpuadd16 moves these buffers with lv.q / sv.q, which transfer 16-byte
     * quads and need 16-byte aligned addresses; a plain float array is only
     * guaranteed float alignment.
     * NOTE(review): assumes the toolchain honours aligned(16) for automatic
     * variables -- confirm on the target compiler.
     */
    float blockM32[16] __attribute__((aligned(16)));
    float blockAdapt32[16] __attribute__((aligned(16)));

    for (int i = 0; i < 16; i++) {
        blockM32[i] = pM[i];
        blockAdapt32[i] = pAdapt[i];
    }

    vfpuadd16;

    for (int i = 0; i < 16; i++) {
        /*
         * The sum of two shorts can fall outside the short range, and a
         * direct float -> short conversion of such a value is undefined.
         * The sum always fits in an int, so convert to int first and let the
         * int -> short conversion do the narrowing, as the integer
         * expression `*pM += *pAdapt` would.
         */
        pM[i] = (short)(int)blockM32[i];
    }
}
/*
 * Subtracts sixteen 16-bit values in pAdapt from the sixteen values at pM,
 * in place.
 *
 * The values are widened to float, subtracted by the vfpusub16 macro (which
 * operates on the local arrays blockM32 / blockAdapt32 by name), and
 * narrowed back into pM.
 *
 * pM     - sixteen shorts, read and overwritten with pM[i] - pAdapt[i]
 * pAdapt - sixteen shorts, read only
 */
static inline void AdaptVFPUSub(short * pM, const short * pAdapt) {
    /*
     * vfpusub16 moves these buffers with lv.q / sv.q, which transfer 16-byte
     * quads and need 16-byte aligned addresses; a plain float array is only
     * guaranteed float alignment.
     * NOTE(review): assumes the toolchain honours aligned(16) for automatic
     * variables -- confirm on the target compiler.
     */
    float blockM32[16] __attribute__((aligned(16)));
    float blockAdapt32[16] __attribute__((aligned(16)));

    for (int i = 0; i < 16; i++) {
        blockM32[i] = pM[i];
        blockAdapt32[i] = pAdapt[i];
    }

    vfpusub16;

    for (int i = 0; i < 16; i++) {
        /*
         * The difference of two shorts can fall outside the short range, and
         * a direct float -> short conversion of such a value is undefined.
         * The difference always fits in an int, so convert to int first and
         * let the int -> short conversion do the narrowing, as the integer
         * expression `*pM -= *pAdapt` would.
         */
        pM[i] = (short)(int)blockM32[i];
    }
}
/*
 * Adjusts the values at pM by the values at pAdapt, sixteen elements at a time.
 *
 * pM         - values to update in place
 * pAdapt     - values to add or subtract; read only
 * nDirection - < 0: pM[i] += pAdapt[i]
 *              > 0: pM[i] -= pAdapt[i]
 *              0  : nothing is changed
 * nOrder     - number of elements; only complete groups of sixteen are
 *              processed (nOrder / 16 groups), any remainder is left untouched
 *
 * A non-positive nOrder is treated as "nothing to do". Without that guard a
 * negative count would keep the countdown loop running far past the end of
 * both buffers.
 */
void Adapt(short * pM, const short * pAdapt, int nDirection, int nOrder)
{
    if (nDirection == 0 || nOrder < 16)
    {
        return;
    }

    for (int nBlocks = nOrder >> 4; nBlocks > 0; nBlocks--)
    {
        if (nDirection < 0)
        {
            AdaptVFPUAdd(pM, pAdapt);   /* 16 x (*pM++ += *pAdapt++) */
        }
        else
        {
            AdaptVFPUSub(pM, pAdapt);   /* 16 x (*pM++ -= *pAdapt++) */
        }
        pM += 16;
        pAdapt += 16;
    }
}