keycard-pro/app/crypto/aes/CM3_1T/CM3_1T_AES_decrypt.S

/*!
* \file CM3_1T_AES_decrypt.S
* \brief single-block AES decryption, single T-table (Td2) variant for Cortex-M3/M4
*
* \author Jan Oleksiewicz <jnk0le@hotmail.com>
* \author Peter Schwabe & Ko Stoffelen @2016
* \license SPDX-License-Identifier: MIT
* \date 9 Jun 2018
*/
.syntax unified
.thumb
.text
.align 3
// void CM3_1T_AES_decrypt(uint8_t* rk, const uint8_t* in, uint8_t* out, size_t rounds)
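//
// Usage sketch (illustrative, not part of the original file): rk points to the
// expanded decryption key schedule, (rounds+1)*16 bytes laid out from the first
// to the last round key (this routine consumes it from the end), so rounds is
// 10/12/14 for AES-128/192/256. As usual for Td-table decryption, the inner
// round keys are assumed to already have InvMixColumns applied; the key schedule
// helper is assumed to be provided elsewhere in the library.
//
//   uint8_t rk[(10 + 1) * 16];          // AES-128: 11 round keys, filled by the caller
//   uint8_t ct[16], pt[16];
//   CM3_1T_AES_decrypt(rk, ct, pt, 10); // pt = inverse cipher of ct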
.global CM3_1T_AES_decrypt
.type CM3_1T_AES_decrypt,%function
CM3_1T_AES_decrypt:
#if __ARM_ARCH_7M__ || __ARM_ARCH_7EM__ || __ARM_ARCH_8M_MAIN__
adds r0, #16 // rk+16: compared against r12 to detect the final round
push {r0, r2, r4-r11, lr} // stack rk+16 and out, save callee-saved regs
movw r14, #:lower16:AES_Td2
movt r14, #:upper16:AES_Td2
//rk_end = rk+16 + rounds * 16
add r12, r0, r3, lsl #4
//load input
ldmia r1!, {r4-r7}
//load the initial (last) round key; decryption walks the schedule backwards
ldmdb r12!, {r0-r3}
//initial AddRoundKey
eors r4, r0
eors r5, r1
eors r6, r2
eors r7, r3
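// main loop: one full decryption round per iteration (InvShiftRows, InvSubBytes
// and InvMixColumns folded into the Td2 lookups, followed by AddRoundKey),
// walking the key schedule downwards through r12; a single table serves all
// four byte lanes, the other lanes being produced by rotating the looked-up word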
1: uxtb r0, r4
uxtb r1, r5
uxtb r2, r6
uxtb r3, r7
#if __ARM_ARCH_7EM__ || __ARM_ARCH_8M_MAIN__
// group the loads by source, in case the table and the round keys lie in different memory blocks
ldr r0, [r14, r0, lsl #2]
ldr r1, [r14, r1, lsl #2]
ldr r2, [r14, r2, lsl #2]
ldr r3, [r14, r3, lsl #2]
ldr r9, [r12, #-12]
ldr r10, [r12, #-8]
ldr r11, [r12, #-4]
ldr r8, [r12, #-16]!
#else // Cortex-M3 can't pre-index anywhere but the first load
ldr r8, [r12, #-16]!
ldr r0, [r14, r0, lsl #2]
ldr r1, [r14, r1, lsl #2]
ldr r2, [r14, r2, lsl #2]
ldr r3, [r14, r3, lsl #2]
ldr r9, [r12, #4]
ldr r10, [r12, #8]
ldr r11, [r12, #12]
#endif
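// combine the round key words with the row-0 lookups, rotated into place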
eor r8, r8, r0, ror #16
eor r9, r9, r1, ror #16
eor r10, r10, r2, ror #16
eor r11, r11, r3, ror #16
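// row 1 bytes, taken from the neighbouring columns as dictated by InvShiftRows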
uxtb r0, r7, ror #8
uxtb r1, r4, ror #8
uxtb r2, r5, ror #8
uxtb r3, r6, ror #8
ldr r0, [r14, r0, lsl #2]
ldr r1, [r14, r1, lsl #2]
ldr r2, [r14, r2, lsl #2]
ldr r3, [r14, r3, lsl #2]
eor r8, r8, r0, ror #8
eor r9, r9, r1, ror #8
eor r10, r10, r2, ror #8
eor r11, r11, r3, ror #8
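// rows 2 and 3: ror #16 picks the third byte of each column, lsr #24 the top
// byte, which also frees r4-r7 for the next-round state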
uxtb r0, r6, ror #16
uxtb r1, r7, ror #16
uxtb r2, r4, ror #16
uxtb r3, r5, ror #16
lsrs r5, #24
lsrs r6, #24
lsrs r7, #24
lsrs r4, #24
ldr r0, [r14, r0, lsl #2]
ldr r1, [r14, r1, lsl #2]
ldr r2, [r14, r2, lsl #2]
ldr r3, [r14, r3, lsl #2]
ldr r5, [r14, r5, lsl #2]
ldr r6, [r14, r6, lsl #2]
ldr r7, [r14, r7, lsl #2]
ldr r4, [r14, r4, lsl #2]
// change the XOR order so the results land back in r4-r7 without extra moves
eor r0, r0, r5, ror #24
eor r1, r1, r6, ror #24
// set the flags early to optimize speculative fetches on Cortex-M3
// the cmp has to be close to the branch, otherwise the speculative code loads don't work
ldr r5, [sp]
cmp r5, r12
eor r2, r2, r7, ror #24
eor r3, r3, r4, ror #24
eor r4, r8, r0
eor r5, r9, r1
eor r6, r10, r2
eor r7, r11, r3
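// r4-r7 now hold the state for the next round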
bne.w 1b // .w encoding aligns the following code to 4 bytes
// final round
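// InvShiftRows + InvSubBytes go through the byte-wide AES_inv_sbox (there is no
// InvMixColumns in the last round); AddRoundKey uses the first 16 bytes of the
// schedule and the output words are reassembled byte by byte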
movw r14, #:lower16:AES_inv_sbox
movt r14, #:upper16:AES_inv_sbox
uxtb r0, r4
uxtb r1, r5
uxtb r2, r6
uxtb r3, r7
ldr r8, [r12, #-16]
ldr r9, [r12, #-12]
ldr r10, [r12, #-8]
ldr r11, [r12, #-4]
ldrb r0, [r14, r0]
ldrb r1, [r14, r1]
ldrb r2, [r14, r2]
ldrb r3, [r14, r3]
eor r8, r0
eor r9, r1
eor r10, r2
eor r11, r3
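// row 1 bytes through the inverse S-box, shifted into byte lane 1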
uxtb r0, r7, ror #8
uxtb r1, r4, ror #8
uxtb r2, r5, ror #8
uxtb r3, r6, ror #8
ldrb r0, [r14, r0]
ldrb r1, [r14, r1]
ldrb r2, [r14, r2]
ldrb r3, [r14, r3]
eor r8, r8, r0, lsl #8
eor r9, r9, r1, lsl #8
eor r10, r10, r2, lsl #8
eor r11, r11, r3, lsl #8
uxtb r0, r6, ror #16
uxtb r1, r7, ror #16
uxtb r2, r4, ror #16
uxtb r3, r5, ror #16
lsrs r5, #24
lsrs r6, #24
lsrs r7, #24
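// r12 (the rk pointer, no longer needed) takes the last top byte so r4 can hold
// the output pointer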
uxtb r12, r4, ror #24
ldr r4, [sp, #4] // load output pointer
ldrb r0, [r14, r0]
ldrb r1, [r14, r1]
ldrb r2, [r14, r2]
ldrb r3, [r14, r3]
ldrb r5, [r14, r5]
ldrb r6, [r14, r6]
ldrb r7, [r14, r7]
ldrb r12, [r14, r12]
eor r8, r8, r0, lsl #16
eor r9, r9, r1, lsl #16
eor r10, r10, r2, lsl #16
eor r11, r11, r3, lsl #16
eor r0, r8, r5, lsl #24
eor r1, r9, r6, lsl #24
eor r2, r10, r7, lsl #24
eor r3, r11, r12, lsl #24
add sp, #8 // discard the stacked rk+16/out; less mem pressure than a preindexed load + dummy pop
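// write the decrypted block to out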
str r0, [r4, #0]
str r1, [r4, #4]
str r2, [r4, #8]
str r3, [r4, #12]
pop {r4-r11, pc}
#else
b . // hang in place in case the function was called on a Thumb-1-only core
#endif