gnupg/scripts/conf-w32brg/cipher/aescrypt.asm


; ---------------------------------------------------------------------------
; Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.
; All rights reserved.
;
; LICENSE TERMS
;
; The free distribution and use of this software in both source and binary
; form is allowed (with or without changes) provided that:
;
;   1. distributions of this source code include the above copyright
;      notice, this list of conditions and the following disclaimer;
;
;   2. distributions in binary form include the above copyright
;      notice, this list of conditions and the following disclaimer
;      in the documentation and/or other associated materials;
;
;   3. the copyright holder's name is not used to endorse products
;      built using this software without specific written permission.
;
; ALTERNATIVELY, provided that this notice is retained in full, this product
; may be distributed under the terms of the GNU General Public License (GPL),
; in which case the provisions of the GPL apply INSTEAD OF those given above.
;
; DISCLAIMER
;
; This software is provided 'as is' with no explicit or implied warranties
; in respect of its properties, including, but not limited to, correctness
; and/or fitness for purpose.
; ---------------------------------------------------------------------------
; Issue Date: 1/06/2003

; An AES implementation for Pentium processors using the NASM assembler (see
; <http://sourceforge.net/projects/nasm>). This version provides the standard
; AES block length (128 bits, 16 bytes) with the same interface as that used
; in my C implementation.  The eax, ecx and edx registers and the arithmetic
; status flags are not preserved.   The ebx, esi, edi, and ebp registers are
; preserved across calls.  Only encryption and decryption are provided here,
; the key scheduling code being that in aeskey.c compiled with USE_ASM
; defined. This code uses the VC++ register saving conventions; if it is used
; with another compiler, its conventions for using and saving registers will
; need to be checked (and calling conventions).    The NASM command line for
; the VC++ custom build step is:
;
;    nasm -O2 -f win32 -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"

    section .text ; use32

; aes_rval aes_encrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
; aes_rval aes_decrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
; comment in/out the following lines to obtain the desired subroutines

%define ENCRYPTION  ; define if encryption is needed
%define DECRYPTION  ; define if decryption is needed

; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the subroutine's name.
; We must also remove our parameters from the stack before return (see
; the do_ret macro). Define AES_DLL for the Dynamic Link Library version.

;%define AES_DLL

tlen:   equ  1024   ; length of each of 4 'xor' arrays (256 32-bit words)

; offsets to parameters, relative to esp on entry to a subroutine (only
; the return address is on the stack at that point); the subroutines add
; stk_spc to these once they have reserved their own stack space

in_blk: equ     4   ; input byte array address parameter
out_blk:equ     8   ; output byte array address parameter
ctx:    equ    12   ; AES context structure
stk_spc:equ    24   ; stack space: two local slots at esp+0 and esp+4 (used
                    ; by the save/restore macros) followed by the saved
                    ; edi, esi, ebx and ebp at esp+8, +12, +16 and +20

; register mapping for encrypt and decrypt subroutines

; Roles of the aliases in the subroutines below:
;   r0-r3  the four 32-bit state columns
;   r4     the input column currently being consumed by the column macros
;   r5     scratch register (holds the table index)
;   r6     context / key schedule pointer, reused as the output pointer

%define r0  eax
%define r1  ebx
%define r2  esi
%define r3  edi
%define r4  ecx
%define r5  edx
%define r6  ebp

; The column macros append 'l' or 'h' to the name of their input register
; to select its low byte or its second byte; these aliases map the names
; formed that way onto the real 8-bit registers. Only eax, ebx, ecx and
; edx have such aliases, so only they can serve as the input register.

%define eaxl  al
%define eaxh  ah
%define ebxl  bl
%define ebxh  bh
%define ecxl  cl
%define ecxh  ch
%define edxl  dl
%define edxh  dh

; These macros take a 32-bit word representing a column and use each
; of its 4 bytes to index a table of 256 32-bit words which are xored
; into each of the four output columns. The output values are in the
; registers %1, %2, %3 and %4 and the column input is in %5 with %6
; as a scratch register.

; Parameters:
;   %1  out_state[0]
;   %2  out_state[1]
;   %3  out_state[2]
;   %4  out_state[3]
;   %5  input register for the round (destroyed)
;   %6  scratch register for the round
;   %7  base address of the lookup tables (the four tables used for the
;       four bytes of a column are tlen bytes apart)
;   %8  key schedule address for round (in form r6 + offset); taken only
;       by do_fcol and do_icol, which also load the round key words

%macro do_fcol 8            ; first column forward round

; Arguments 1 to 4 are the output registers, 5 is the input column
; (destroyed), 6 is scratch, 7 is the table base and 8 is the round key
; address. Each output is loaded with one round key word and then xored
; with the table entry selected by one byte of the input column. The
; old value of the fourth output is moved into the input register just
; before that output is overwritten, so the caller finds its next input
; column there on exit.

    movzx   %6,%5l          ; index = byte 0 of the input column
    mov     %1,[%8]         ; first output  = round key word 0
    xor     %1,[4*%6+%7]    ;   xor entry from table 0
    movzx   %6,%5h          ; index = byte 1
    shr     %5,16           ; bring bytes 2 and 3 down into the low half
    mov     %2,[%8+12]      ; second output = round key word 3
    xor     %2,[4*%6+%7+tlen]   ; xor entry from table 1
    movzx   %6,%5l          ; index = byte 2
    mov     %3,[%8+ 8]      ; third output  = round key word 2
    xor     %3,[4*%6+%7+2*tlen] ; xor entry from table 2
    movzx   %6,%5h          ; index = byte 3
    mov     %5,%4           ; save an input register value
    mov     %4,[%8+ 4]      ; fourth output = round key word 1
    xor     %4,[4*%6+%7+3*tlen] ; xor entry from table 3

%endmacro

%macro do_icol 8            ; first column for inverse round

; Same structure and arguments as do_fcol (1 to 4 outputs, 5 input
; column, 6 scratch, 7 table base, 8 round key address); the only
; difference is which round key word each output is loaded with: the
; second output takes word 1 and the fourth takes word 3. The old value
; of the fourth output is again handed back in the input register.

    movzx   %6,%5l          ; index = byte 0 of the input column
    mov     %1,[%8]         ; first output  = round key word 0
    xor     %1,[4*%6+%7]    ;   xor entry from table 0
    movzx   %6,%5h          ; index = byte 1
    shr     %5,16           ; bring bytes 2 and 3 down into the low half
    mov     %2,[%8+ 4]      ; second output = round key word 1
    xor     %2,[4*%6+%7+tlen]   ; xor entry from table 1
    movzx   %6,%5l          ; index = byte 2
    mov     %3,[%8+ 8]      ; third output  = round key word 2
    xor     %3,[4*%6+%7+2*tlen] ; xor entry from table 2
    movzx   %6,%5h          ; index = byte 3
    mov     %5,%4           ; save an input register value
    mov     %4,[%8+12]      ; fourth output = round key word 3
    xor     %4,[4*%6+%7+3*tlen] ; xor entry from table 3

%endmacro

%macro do_col   7           ; other columns for forward and inverse rounds

; Arguments 1 to 4 are the output registers, 5 is the input column
; (destroyed), 6 is scratch and 7 is the table base. Unlike the first
; column macros this one loads no round key: it only xors one table
; entry per input byte into outputs that already hold partial results.

    movzx   %6,%5l          ; index = byte 0 of the input column
    xor     %1,[4*%6+%7]    ; first output  ^= entry from table 0
    movzx   %6,%5h          ; index = byte 1
    shr     %5,16           ; bring bytes 2 and 3 down into the low half
    xor     %2,[4*%6+%7+tlen]   ; second output ^= entry from table 1
    movzx   %6,%5l          ; index = byte 2
    xor     %3,[4*%6+%7+2*tlen] ; third output  ^= entry from table 2
    movzx   %6,%5h          ; index = byte 3
    xor     %4,[4*%6+%7+3*tlen] ; fourth output ^= entry from table 3

%endmacro

; These macros implement stack based local variables

%macro  save 2              ; save slot,reg
    ; store the register in the numbered 4-byte local slot at the
    ; bottom of the stack frame
    mov     [esp + 4*%1], %2
%endmacro

%macro  restore 2           ; restore reg,slot
    ; reload the register from the numbered 4-byte local slot at the
    ; bottom of the stack frame
    mov     %1, [esp + 4*%2]
%endmacro

; This macro performs a forward encryption cycle. It is entered with
; the first previous round column values in r0, r1, r2 and r3 and
; exits with the final values in the same registers.

%macro fwd_rnd 1-2 _t_fn                ; normal forward rounds

; The first argument is the round key address (in form r6 + offset);
; the optional second argument is the table base, _t_fn by default.
; r4 and r5 are destroyed and local slots 0 and 1 are overwritten.

    mov     r4,r0           ; r4 = input column 0 (r0 is overwritten first)
    save    0,r2            ; keep input column 2 in local slot 0
    save    1,r3            ; keep input column 3 in local slot 1

; compute new column values; the output register order is rotated by
; one position for each successive input column

    do_fcol r0,r3,r2,r1, r4,r5, %2, %1  ; r4 = input r0
    do_col  r1,r0,r3,r2, r4,r5, %2      ; r4 = input r1 (saved by do_fcol)
    restore r4,0
    do_col  r2,r1,r0,r3, r4,r5, %2      ; r4 = input r2
    restore r4,1
    do_col  r3,r2,r1,r0, r4,r5, %2      ; r4 = input r3

%endmacro

; This macro performs an inverse encryption cycle. It is entered with
; the first previous round column values in r0, r1, r2 and r3 and
; exits with the final values in the same registers.

%macro inv_rnd 1-2 _t_in                ; normal inverse round

; The first argument is the round key address (in form r6 + offset);
; the optional second argument is the table base, _t_in by default.
; r4 and r5 are destroyed and local slots 0 and 1 are overwritten.

    mov     r4,r0           ; r4 = input column 0 (r0 is overwritten first)
    save    0,r1            ; keep input column 1 in local slot 0
    save    1,r2            ; keep input column 2 in local slot 1

; compute new column values; the input columns are taken in the order
; 0, 3, 2, 1 with the output register order rotated to match

    do_icol r0,r1,r2,r3, r4,r5, %2, %1  ; r4 = r0
    do_col  r3,r0,r1,r2, r4,r5, %2      ; r4 = r3 (saved by do_icol)
    restore r4,1
    do_col  r2,r3,r0,r1, r4,r5, %2      ; r4 = r2
    restore r4,0
    do_col  r1,r2,r3,r0, r4,r5, %2      ; r4 = r1

%endmacro

; the DLL has to implement the _stdcall calling interface on return
; In this case we have to take our parameters (3 4-byte pointers)
; off the stack

%macro  do_ret  0
%ifndef AES_DLL
    ret                     ; plain return, parameters stay on the stack
%else
    ret     12              ; DLL build: also drop the 3 pointer parameters
%endif
%endmacro

%macro  do_name 1
%ifdef AES_DLL
    ; DLL build: the symbol carries the @12 parameter-size suffix and
    ; is exported as well as made global
    global  %1@12
    export  %1@12
%1@12:
%else
    ; static build: the bare name is made global
    global  %1
%1:
%endif
%endmacro

; AES Encryption Subroutine

%ifdef  ENCRYPTION

; aes_rval aes_encrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
; Encrypts one 16-byte block. Returns 0 in eax on success, or -1 when
; the round count found in the context is not 10, 12 or 14 (nothing is
; written to out_blk in that case). ebx, esi, edi and ebp are saved on
; entry and restored on every exit path; ecx, edx and the flags are
; used as scratch.

    extern  _t_fn
    extern  _t_fl

    do_name _aes_encrypt

    sub     esp,stk_spc             ; reserve local slots and save area
    mov     [esp+20],ebp
    mov     [esp+16],ebx
    mov     [esp+12],esi
    mov     [esp+ 8],edi
    mov     r4,[esp+in_blk+stk_spc] ; input pointer
    mov     r6,[esp+ctx+stk_spc]    ; key pointer

; input four columns and xor in first round key

    mov     r0,[r4   ]
    mov     r1,[r4+ 4]
    xor     r0,[r6   ]
    xor     r1,[r6+ 4]
    mov     r2,[r4+ 8]
    mov     r3,[r4+12]
    xor     r2,[r6+ 8]
    xor     r3,[r6+12]

; determine the number of rounds: if context words 45, 52 and 53 xor to
; zero the schedule is treated as a 14 round one, otherwise word 52 is
; taken as the round count and must be 10 or 12
; NOTE(review): presumably the zero test relies on the 256-bit key
; expansion relation between these three words - confirm in aeskey.c

    mov     r4,[r6+4*45]
    mov     r5,[r6+4*52]
    xor     r4,[r6+4*53]
    xor     r4,r5
    je      .1
    cmp     r5,10
    je      .3
    cmp     r5,12
    je      .2
    mov     ebp,[esp+20]            ; unsupported round count:
    mov     ebx,[esp+16]            ; restore the saved registers,
    mov     esi,[esp+12]            ; release the frame and
    mov     edi,[esp+ 8]            ; return -1
    lea     esp,[esp+stk_spc]
    mov     eax,-1
    do_ret

; the longer schedules run two extra rounds and then advance r6 by two
; round keys so that the shared code below can keep using fixed offsets

.1: fwd_rnd r6+ 16          ; 14 rounds for 256-bit key
    fwd_rnd r6+ 32
    lea     r6,[r6+32]
.2: fwd_rnd r6+ 16          ; 12 rounds for 192-bit key
    fwd_rnd r6+ 32
    lea     r6,[r6+32]
.3: fwd_rnd r6+ 16          ; 10 rounds for 128-bit key
    fwd_rnd r6+ 32
    fwd_rnd r6+ 48
    fwd_rnd r6+ 64
    fwd_rnd r6+ 80
    fwd_rnd r6+ 96
    fwd_rnd r6+112
    fwd_rnd r6+128
    fwd_rnd r6+144
    fwd_rnd r6+160, _t_fl   ; last round uses a different table

; move final values to the output array

    mov     r6,[esp+out_blk+stk_spc]    ; r6 is free again: output pointer
    mov     [r6+12],r3
    mov     [r6+8],r2
    mov     [r6+4],r1
    mov     [r6],r0
    mov     ebp,[esp+20]            ; restore the saved registers
    mov     ebx,[esp+16]
    mov     esi,[esp+12]
    mov     edi,[esp+ 8]
    lea     esp,[esp+stk_spc]       ; release the frame
    xor     eax,eax                 ; return 0
    do_ret

%endif

; AES Decryption Subroutine

%ifdef  DECRYPTION

; aes_rval aes_decrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
; Decrypts one 16-byte block. Returns 0 in eax on success, or -1 when
; the round count found in the context is not 10, 12 or 14 (nothing is
; written to out_blk in that case). ebx, esi, edi and ebp are saved on
; entry and restored on every exit path; ecx, edx and the flags are
; used as scratch.

    extern  _t_in
    extern  _t_il

    do_name _aes_decrypt

    sub     esp,stk_spc             ; reserve local slots and save area
    mov     [esp+20],ebp
    mov     [esp+16],ebx
    mov     [esp+12],esi
    mov     [esp+ 8],edi
    mov     r4,[esp+in_blk+stk_spc] ; input pointer
    mov     r6,[esp+ctx+stk_spc]    ; context pointer

; input four columns

    mov     r0,[r4]
    mov     r1,[r4+4]
    mov     r2,[r4+8]
    mov     r3,[r4+12]

; determine the number of rounds: if context words 45, 52 and 53 xor to
; zero the schedule is treated as a 14 round one, otherwise word 52 is
; taken as the round count
; NOTE(review): presumably the zero test relies on the 256-bit key
; expansion relation between these three words - confirm in aeskey.c

    mov     r5,[r6+4*52]
    mov     r4,[r6+4*45]
    xor     r4,[r6+4*53]
    xor     r4,r5
    jne     .1
    mov     r5,14

; reject an unsupported round count before it is used as an index, so
; that a bad context cannot cause a read far outside the key schedule

.1: cmp     r5,10
    je      .5
    cmp     r5,12
    je      .5
    cmp     r5,14
    jne     .4

; xor in initial keys (the round key at offset 16 * rounds)

.5: lea     r4,[4*r5]
    xor     r0,[r6+4*r4   ]
    xor     r1,[r6+4*r4+ 4]
    xor     r2,[r6+4*r4+ 8]
    xor     r3,[r6+4*r4+12]

; r5 is known to be 10, 12 or 14 here; the compares are repeated because
; the xor instructions above have changed the flags

    cmp     r5,10
    je      .3
    cmp     r5,12
    je      .2

    inv_rnd r6+208          ; 14 rounds for 256-bit key
    inv_rnd r6+192
.2: inv_rnd r6+176          ; 12 rounds for 192-bit key
    inv_rnd r6+160
.3: inv_rnd r6+144          ; 10 rounds for 128-bit key
    inv_rnd r6+128
    inv_rnd r6+112
    inv_rnd r6+ 96
    inv_rnd r6+ 80
    inv_rnd r6+ 64
    inv_rnd r6+ 48
    inv_rnd r6+ 32
    inv_rnd r6+ 16
    inv_rnd r6, _t_il       ; last round uses a different table

; move final values to the output array.

    mov     r6,[esp+out_blk+stk_spc]    ; r6 is free again: output pointer
    mov     [r6+12],r3
    mov     [r6+8],r2
    mov     [r6+4],r1
    mov     [r6],r0
    mov     ebp,[esp+20]            ; restore the saved registers
    mov     ebx,[esp+16]
    mov     esi,[esp+12]
    mov     edi,[esp+ 8]
    lea     esp,[esp+stk_spc]       ; release the frame
    xor     eax,eax                 ; return 0
    do_ret

; unsupported round count: restore the saved registers and return -1

.4: mov     ebp,[esp+20]
    mov     ebx,[esp+16]
    mov     esi,[esp+12]
    mov     edi,[esp+ 8]
    lea     esp,[esp+stk_spc]
    mov     eax,-1
    do_ret

%endif

    end
* zh_TW.po, fi.po: Updated from upstream. * be.po: New. * de.po: Updated. * ru.po: Added. New version by Maxim Britov. 2003-12-03 15:18:42 +00:00
			`; ---------------------------------------------------------------------------`
			`; Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.`
			`; All rights reserved.`
			`;`
			`; LICENSE TERMS`
			`;`
			`; The free distribution and use of this software in both source and binary`
			`; form is allowed (with or without changes) provided that:`
			`;`
			`; 1. distributions of this source code include the above copyright`
			`; notice, this list of conditions and the following disclaimer;`
			`;`
			`; 2. distributions in binary form include the above copyright`
			`; notice, this list of conditions and the following disclaimer`
			`; in the documentation and/or other associated materials;`
			`;`
			`; 3. the copyright holder's name is not used to endorse products`
			`; built using this software without specific written permission.`
			`;`
			`; ALTERNATIVELY, provided that this notice is retained in full, this product`
			`; may be distributed under the terms of the GNU General Public License (GPL),`
			`; in which case the provisions of the GPL apply INSTEAD OF those given above.`
			`;`
			`; DISCLAIMER`
			`;`
			`; This software is provided 'as is' with no explicit or implied warranties`
			`; in respect of its properties, including, but not limited to, correctness`
			`; and/or fitness for purpose.`
			`; ---------------------------------------------------------------------------`
			`; Issue Date: 1/06/2003`

			`; An AES implementation for Pentium processors using the NASM assembler (see`
			`; <http://sourceforge.net/projects/nasm>).This version provides the standard`
			`; AES block length (128 bits, 16 bytes) with the same interface as that used`
			`; in my C implementation. The eax, ecx and edx registers and the artihmetic`
			`; status flags are not preserved. The ebx, esi, edi, and ebp registers are`
			`; preserved across calls. Only encryption and decryption are provided here,`
			`; here, the key scheduling code being that in aeskey.c compiled with USE_ASM`
			`; defined. This code uses the VC++ register saving conentions; if it is used`
			`; with another compiler, its conventions for using and saving registers will`
			`; need to be checked (and calling conventions). The NASM command line for`
			`; the VC++ custom build step is:`
			`;`
			`; nasm -O2 -f win32 -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"`

			`section .text ; use32`

			`; aes_rval aes_encrypt(const unsigned char in_blk[],`
			`; unsigned char out_blk[], const aes_encrypt_ctx cx[1]);`
			`; aes_rval aes_decrypt(const unsigned char in_blk[],`
			`; unsigned char out_blk[], const aes_decrypt_ctx cx[1]);`
			`;`
			`; comment in/out the following lines to obtain the desired subroutines`

			`%define ENCRYPTION ; define if encryption is needed`
			`%define DECRYPTION ; define if decryption is needed`

			`; The DLL interface must use the _stdcall convention in which the number`
			`; of bytes of parameter space is added after an @ to the sutine's name.`
			`; We must also remove our parameters from the stack before return (see`
			`; the do_ret macro). Define AES_DLL for the Dynamic Link Library version.`

			`;%define AES_DLL`

			`tlen: equ 1024 ; length of each of 4 'xor' arrays (256 32-bit words)`

			`; offsets to parameters with one register pushed onto stack`

			`in_blk: equ 4 ; input byte array address parameter`
			`out_blk:equ 8 ; output byte array address parameter`
			`ctx: equ 12 ; AES context structure`
			`stk_spc:equ 24 ; stack space`

			`; register mapping for encrypt and decrypt subroutines`

			`%define r0 eax`
			`%define r1 ebx`
			`%define r2 esi`
			`%define r3 edi`
			`%define r4 ecx`
			`%define r5 edx`
			`%define r6 ebp`

			`%define eaxl al`
			`%define eaxh ah`
			`%define ebxl bl`
			`%define ebxh bh`
			`%define ecxl cl`
			`%define ecxh ch`
			`%define edxl dl`
			`%define edxh dh`

			`; These macros take a 32-bit word representing a column and use each`
			`; of its 4 bytes to index a table of 256 32-bit words which are xored`
			`; into each of the four output columns. The output values are in the`
			`; registers %1, %2, %3 and %4 and the column input is in %5 with %6`
			`; as a scratch register.`

			`; Parameters:`
			`; %1 out_state[0]`
			`; %2 out_state[1]`
			`; %3 out_state[2]`
			`; %4 out_state[3]`
			`; %5 input register for the round (destroyed)`
			`; %6 scratch register for the round`
			`; %7 key schedule address for round (in form r6 + offset)`

			`%macro do_fcol 8 ; first column forward round`

			`movzx %6,%5l`
			`mov %1,[%8]`
			`xor %1,[4*%6+%7]`
			`movzx %6,%5h`
			`shr %5,16`
			`mov %2,[%8+12]`
			`xor %2,[4*%6+%7+tlen]`
			`movzx %6,%5l`
			`mov %3,[%8+ 8]`
			`xor %3,[4*%6+%7+2*tlen]`
			`movzx %6,%5h`
			`mov %5,%4 ; save an input register value`
			`mov %4,[%8+ 4]`
			`xor %4,[4*%6+%7+3*tlen]`

			`%endmacro`

			`%macro do_icol 8 ; first column for inverse round`

			`movzx %6,%5l`
			`mov %1,[%8]`
			`xor %1,[4*%6+%7]`
			`movzx %6,%5h`
			`shr %5,16`
			`mov %2,[%8+ 4]`
			`xor %2,[4*%6+%7+tlen]`
			`movzx %6,%5l`
			`mov %3,[%8+ 8]`
			`xor %3,[4*%6+%7+2*tlen]`
			`movzx %6,%5h`
			`mov %5,%4 ; save an input register value`
			`mov %4,[%8+12]`
			`xor %4,[4*%6+%7+3*tlen]`

			`%endmacro`

			`%macro do_col 7 ; other columns for forward and inverse rounds`

			`movzx %6,%5l`
			`xor %1,[4*%6+%7]`
			`movzx %6,%5h`
			`shr %5,16`
			`xor %2,[4*%6+%7+tlen]`
			`movzx %6,%5l`
			`xor %3,[4*%6+%7+2*tlen]`
			`movzx %6,%5h`
			`xor %4,[4*%6+%7+3*tlen]`

			`%endmacro`

			`; These macros implement stack based local variables`

			`%macro save 2`
			`mov [esp+4*%1],%2`
			`%endmacro`

			`%macro restore 2`
			`mov %1,[esp+4*%2]`
			`%endmacro`

			`; This macro performs a forward encryption cycle. It is entered with`
			`; the first previous round column values in r0, r1, r2 and r3 and`
			`; exits with the final values in the same registers.`

			`%macro fwd_rnd 1-2 _t_fn ; normal forward rounds`

			`mov r4,r0`
			`save 0,r2`
			`save 1,r3`

			`; compute new column values`

			`do_fcol r0,r3,r2,r1, r4,r5, %2, %1 ; r4 = input r0`
			`do_col r1,r0,r3,r2, r4,r5, %2 ; r4 = input r1 (saved in fcol_f)`
			`restore r4,0`
			`do_col r2,r1,r0,r3, r4,r5, %2 ; r4 = input r2`
			`restore r4,1`
			`do_col r3,r2,r1,r0, r4,r5, %2 ; r4 = input r3`

			`%endmacro`

			`; This macro performs an inverse encryption cycle. It is entered with`
			`; the first previous round column values in r0, r1, r2 and r3 and`
			`; exits with the final values in the same registers.`

			`%macro inv_rnd 1-2 _t_in ; normal inverse round`

			`mov r4,r0`
			`save 0,r1`
			`save 1,r2`

			`; compute new column values`

			`do_icol r0,r1,r2,r3, r4,r5, %2, %1 ; r4 = r0`
			`do_col r3,r0,r1,r2, r4,r5, %2 ; r4 = r3 (saved in icol_f)`
			`restore r4,1`
			`do_col r2,r3,r0,r1, r4,r5, %2 ; r4 = r2`
			`restore r4,0`
			`do_col r1,r2,r3,r0, r4,r5, %2 ; r4 = r1`

			`%endmacro`

			`; the DLL has to implement the _stdcall calling interface on return`
			`; In this case we have to take our parameters (3 4-byte pointers)`
			`; off the stack`

			`%macro do_ret 0`
			`%ifdef AES_DLL`
			`ret 12`
			`%else`
			`ret`
			`%endif`
			`%endmacro`

			`%macro do_name 1`
			`%ifndef AES_DLL`
			`global %1`
			`%1:`
			`%else`
			`global %1@12`
			`export %1@12`
			`%1@12:`
			`%endif`
			`%endmacro`

			`; AES Encryption Subroutine`

			`%ifdef ENCRYPTION`

			`extern _t_fn`
			`extern _t_fl`

			`do_name _aes_encrypt`

			`sub esp,stk_spc`
			`mov [esp+20],ebp`
			`mov [esp+16],ebx`
			`mov [esp+12],esi`
			`mov [esp+ 8],edi`
			`mov r4,[esp+in_blk+stk_spc] ; input pointer`
			`mov r6,[esp+ctx+stk_spc] ; key pointer`

			`; input four columns and xor in first round key`

			`mov r0,[r4 ]`
			`mov r1,[r4+ 4]`
			`xor r0,[r6 ]`
			`xor r1,[r6+ 4]`
			`mov r2,[r4+ 8]`
			`mov r3,[r4+12]`
			`xor r2,[r6+ 8]`
			`xor r3,[r6+12]`

			`; determine the number of rounds`

			`mov r4,[r6+4*45]`
			`mov r5,[r6+4*52]`
			`xor r4,[r6+4*53]`
			`xor r4,r5`
			`je .1`
			`cmp r5,10`
			`je .3`
			`cmp r5,12`
			`je .2`
			`mov ebp,[esp+20]`
			`mov ebx,[esp+16]`
			`mov esi,[esp+12]`
			`mov edi,[esp+ 8]`
			`lea esp,[esp+stk_spc]`
			`mov eax,-1`
			`do_ret`

			`.1: fwd_rnd r6+ 16 ; 14 rounds for 256-bit key`
			`fwd_rnd r6+ 32`
			`lea r6,[r6+32]`
			`.2: fwd_rnd r6+ 16 ; 12 rounds for 192-bit key`
			`fwd_rnd r6+ 32`
			`lea r6,[r6+32]`
			`.3: fwd_rnd r6+ 16 ; 10 rounds for 128-bit key`
			`fwd_rnd r6+ 32`
			`fwd_rnd r6+ 48`
			`fwd_rnd r6+ 64`
			`fwd_rnd r6+ 80`
			`fwd_rnd r6+ 96`
			`fwd_rnd r6+112`
			`fwd_rnd r6+128`
			`fwd_rnd r6+144`
			`fwd_rnd r6+160, _t_fl ; last round uses a different table`

			`; move final values to the output array`

			`mov r6,[esp+out_blk+stk_spc]`
			`mov [r6+12],r3`
			`mov [r6+8],r2`
			`mov [r6+4],r1`
			`mov [r6],r0`
			`mov ebp,[esp+20]`
			`mov ebx,[esp+16]`
			`mov esi,[esp+12]`
			`mov edi,[esp+ 8]`
			`lea esp,[esp+stk_spc]`
			`xor eax,eax`
			`do_ret`

			`%endif`

			`; AES Decryption Subroutine`

			`%ifdef DECRYPTION`

			`extern _t_in`
			`extern _t_il`

			`do_name _aes_decrypt`

			`sub esp,stk_spc`
			`mov [esp+20],ebp`
			`mov [esp+16],ebx`
			`mov [esp+12],esi`
			`mov [esp+ 8],edi`
			`mov r4,[esp+in_blk+stk_spc] ; input pointer`
			`mov r6,[esp+ctx+stk_spc] ; context pointer`

			`; input four columns`

			`mov r0,[r4]`
			`mov r1,[r4+4]`
			`mov r2,[r4+8]`
			`mov r3,[r4+12]`

			`; determine the number of rounds`

			`mov r5,[r6+4*52]`
			`mov r4,[r6+4*45]`
			`xor r4,[r6+4*53]`
			`xor r4,r5`
			`jne .1`
			`mov r5,14`

			`; xor in initial keys`

			`.1: lea r4,[4*r5]`
			`xor r0,[r6+4*r4 ]`
			`xor r1,[r6+4*r4+ 4]`
			`xor r2,[r6+4*r4+ 8]`
			`xor r3,[r6+4*r4+12]`
			`cmp r5,10`
			`je .3`
			`cmp r5,12`
			`je .2`
			`cmp r5,14`
			`jne .4`

			`inv_rnd r6+208 ; 14 rounds for 256-bit key`
			`inv_rnd r6+192`
			`.2: inv_rnd r6+176 ; 12 rounds for 192-bit key`
			`inv_rnd r6+160`
			`.3: inv_rnd r6+144 ; 10 rounds for 128-bit key`
			`inv_rnd r6+128`
			`inv_rnd r6+112`
			`inv_rnd r6+ 96`
			`inv_rnd r6+ 80`
			`inv_rnd r6+ 64`
			`inv_rnd r6+ 48`
			`inv_rnd r6+ 32`
			`inv_rnd r6+ 16`
			`inv_rnd r6, _t_il ; last round uses a different table`

			`; move final values to the output array.`

			`mov r6,[esp+out_blk+stk_spc]`
			`mov [r6+12],r3`
			`mov [r6+8],r2`
			`mov [r6+4],r1`
			`mov [r6],r0`
			`mov ebp,[esp+20]`
			`mov ebx,[esp+16]`
			`mov esi,[esp+12]`
			`mov edi,[esp+ 8]`
			`lea esp,[esp+stk_spc]`
			`xor eax,eax`
			`do_ret`

			`.4: mov ebp,[esp+20]`
			`mov ebx,[esp+16]`
			`mov esi,[esp+12]`
			`mov edi,[esp+ 8]`
			`lea esp,[esp+stk_spc]`
			`mov eax,-1`
			`do_ret`

			`%endif`

			`end`