/* op_mul_long.S (revision 1452bee8f06b9f76a333ddf4760e4beaa82f8099) */
    /*
     * Signed 64-bit integer multiply.
     *
     * Consider WXxYZ (r1r0 x r3r2) with a long multiply:
     *        WX
     *      x YZ
     *  --------
     *     ZW ZX
     *  YW YX
     *
     * The low word of the result holds ZX, the high word holds
     * (ZW+YX) + (the high overflow from ZX).  YW doesn't matter because
     * it doesn't fit in the low 64 bits.
     *
     * Only the low 64 bits of the product are kept, and those bits are the
     * same for a signed and an unsigned multiply, which is why the unsigned
     * umull is sufficient here.
     *
     * Unlike most ARM math operations, multiply instructions have
     * restrictions on using the same register more than once (Rd and Rm
     * cannot be the same).
     *
     * Register use inside this handler:
     *   r0      X (low word of vBB); reused afterwards for &fp[AA]
     *   r1      W (high word of vBB)
     *   r2      Z (low word of vCC); reused for the sum YxX + ZxW
     *   r3      Y (high word of vCC)
     *   r9/r10  low/high word of the 64-bit result
     *   ip      ZxW partial product; reused for the next opcode
     *
     * FETCH, FETCH_ADVANCE_INST, GET_INST_OPCODE, GOTO_OPCODE, rFP and rINST
     * are defined outside this file; what they do is described below only
     * as far as the inline notes of this handler state it.
     */
    /* mul-long vAA, vBB, vCC */

    @ Decode the source register indices and turn them into frame addresses
    @ (each virtual register occupies 4 bytes, hence the lsl #2).
    FETCH r0, 1                         @ r0<- CCBB
    and     r2, r0, #255                @ r2<- BB
    mov     r3, r0, lsr #8              @ r3<- CC
    add     r2, rFP, r2, lsl #2         @ r2<- &fp[BB]
    add     r3, rFP, r3, lsl #2         @ r3<- &fp[CC]

    @ Load both 64-bit operands.  vBB must be loaded first: the second ldmia
    @ overwrites r2, which holds the address used by the first one.
    ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
    ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1

    @ Build the product from 32-bit pieces.  mla takes r0 (not r2) as its
    @ first multiplicand so that the destination r2 differs from Rm.
    mul     ip, r2, r1                  @  ip<- ZxW
    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)

    @ r0 (X) is dead after the mla, so it can hold the destination index.
    @ rINST must be read before FETCH_ADVANCE_INST reloads it.
    mov     r0, rINST, lsr #8           @ r0<- AA
    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
    add     r0, rFP, r0, lsl #2         @ r0<- &fp[AA]

    @ Advance to the next instruction, store the result, and dispatch.
    @ ip is free again here: its ZxW value was consumed by the mla above.
    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
    GET_INST_OPCODE ip                  @ extract opcode from rINST
    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
    GOTO_OPCODE ip                      @ jump to next instruction