/*
* fuc microcode for nv98 pcrypt engine
* Copyright (C) 2010 Marcin Kościelnicki
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
.section #nv98_pcrypt_data
ctx_dma:
ctx_dma_query: .b32 0
ctx_dma_src: .b32 0
ctx_dma_dst: .b32 0
.equ #dma_count 3
ctx_query_address_high: .b32 0
ctx_query_address_low: .b32 0
ctx_query_counter: .b32 0
ctx_cond_address_high: .b32 0
ctx_cond_address_low: .b32 0
ctx_cond_off: .b32 0
ctx_src_address_high: .b32 0
ctx_src_address_low: .b32 0
ctx_dst_address_high: .b32 0
ctx_dst_address_low: .b32 0
ctx_mode: .b32 0
.align 16
ctx_key: .skip 16
ctx_iv: .skip 16
.align 0x80
swap:
.skip 32
.align 8
common_cmd_dtable:
.b32 #ctx_query_address_high + 0x20000 ~0xff
.b32 #ctx_query_address_low + 0x20000 ~0xfffffff0
.b32 #ctx_query_counter + 0x20000 ~0xffffffff
.b32 #cmd_query_get + 0x00000 ~1
.b32 #ctx_cond_address_high + 0x20000 ~0xff
.b32 #ctx_cond_address_low + 0x20000 ~0xfffffff0
.b32 #cmd_cond_mode + 0x00000 ~7
.b32 #cmd_wrcache_flush + 0x00000 ~0
.equ #common_cmd_max 0x88
.align 8
engine_cmd_dtable:
.b32 #ctx_key + 0x0 + 0x20000 ~0xffffffff
.b32 #ctx_key + 0x4 + 0x20000 ~0xffffffff
.b32 #ctx_key + 0x8 + 0x20000 ~0xffffffff
.b32 #ctx_key + 0xc + 0x20000 ~0xffffffff
.b32 #ctx_iv + 0x0 + 0x20000 ~0xffffffff
.b32 #ctx_iv + 0x4 + 0x20000 ~0xffffffff
.b32 #ctx_iv + 0x8 + 0x20000 ~0xffffffff
.b32 #ctx_iv + 0xc + 0x20000 ~0xffffffff
.b32 #ctx_src_address_high + 0x20000 ~0xff
.b32 #ctx_src_address_low + 0x20000 ~0xfffffff0
.b32 #ctx_dst_address_high + 0x20000 ~0xff
.b32 #ctx_dst_address_low + 0x20000 ~0xfffffff0
.b32 #crypt_cmd_mode + 0x00000 ~0xf
.b32 #crypt_cmd_length + 0x10000 ~0x0ffffff0
.equ #engine_cmd_max 0xce
.align 4
crypt_dtable:
.b16 #crypt_copy_prep #crypt_do_inout
.b16 #crypt_store_prep #crypt_do_out
.b16 #crypt_ecb_e_prep #crypt_do_inout
.b16 #crypt_ecb_d_prep #crypt_do_inout
.b16 #crypt_cbc_e_prep #crypt_do_inout
.b16 #crypt_cbc_d_prep #crypt_do_inout
.b16 #crypt_pcbc_e_prep #crypt_do_inout
.b16 #crypt_pcbc_d_prep #crypt_do_inout
.b16 #crypt_cfb_e_prep #crypt_do_inout
.b16 #crypt_cfb_d_prep #crypt_do_inout
.b16 #crypt_ofb_prep #crypt_do_inout
.b16 #crypt_ctr_prep #crypt_do_inout
.b16 #crypt_cbc_mac_prep #crypt_do_in
.b16 #crypt_cmac_finish_complete_prep #crypt_do_in
.b16 #crypt_cmac_finish_partial_prep #crypt_do_in
.align 0x100
.section #nv98_pcrypt_code
// $r0 is always set to 0 in our code - this allows some space savings.
clear b32 $r0
// set up the interrupt handler
mov $r1 #ih
mov $iv0 $r1
// init stack pointer
mov $sp $r0
// set interrupt dispatch - route timer, fifo, ctxswitch to i0, others to host
movw $r1 0xfff0
sethi $r1 0
mov $r2 0x400
iowr I[$r2 + 0x300] $r1
// enable the interrupts
or $r1 0xc
iowr I[$r2] $r1
// enable fifo access and context switching
mov $r1 3
mov $r2 0x1200
iowr I[$r2] $r1
// enable i0 delivery
bset $flags ie0
// sleep forver, waking only for interrupts.
bset $flags $p0
spin:
sleep $p0
bra #spin
// i0 handler
ih:
// see which interrupts we got
iord $r1 I[$r0 + 0x200]
and $r2 $r1 0x8
cmpu b32 $r2 0
bra e #noctx
// context switch... prepare the regs for xfer
mov $r2 0x7700
mov $xtargets $r2
mov $xdbase $r0
// 128-byte context.
mov $r2 0
sethi $r2 0x50000
// read current channel
mov $r3 0x1400
iord $r4 I[$r3]
// if bit 30 set, it's active, so we have to unload it first.
shl b32 $r5 $r4 1
cmps b32 $r5 0
bra nc #ctxload
// unload the current channel - save the context
xdst $r0 $r2
xdwait
// and clear bit 30, then write back
bclr $r4 0x1e
iowr I[$r3] $r4
// tell PFIFO we unloaded
mov $r4 1
iowr I[$r3 + 0x200] $r4
bra #noctx
ctxload:
// no channel loaded - perhaps we're requested to load one
iord $r4 I[$r3 + 0x100]
shl b32 $r15 $r4 1
cmps b32 $r15 0
// if bit 30 of next channel not set, probably PFIFO is just
// killing a context. do a faux load, without the active bit.
bra nc #dummyload
// ok, do a real context load.
xdld $r0 $r2
xdwait
mov $r5 #ctx_dma
mov $r6 #dma_count - 1
ctxload_dma_loop:
ld b32 $r7 D[$r5 + $r6 * 4]
add b32 $r8 $r6 0x180
shl b32 $r8 8
iowr I[$r8] $r7
sub b32 $r6 1
bra nc #ctxload_dma_loop
dummyload:
// tell PFIFO we're done
mov $r5 2
iowr I[$r3 + 0x200] $r5
noctx:
and $r2 $r1 0x4
cmpu b32 $r2 0
bra e #nocmd
// incoming fifo command.
mov $r3 0x1900
iord $r2 I[$r3 + 0x100]
iord $r3 I[$r3]
// extract the method
and $r4 $r2 0x7ff
// shift the addr to proper position if we need to interrupt later
shl b32 $r2 0x10
// mthd 0 and 0x100 [NAME, NOP]: ignore
and $r5 $r4 0x7bf
cmpu b32 $r5 0
bra e #cmddone
mov $r5 #engine_cmd_dtable - 0xc0 * 8
mov $r6 #engine_cmd_max
cmpu b32 $r4 0xc0
bra nc #dtable_cmd
mov $r5 #common_cmd_dtable - 0x80 * 8
mov $r6 #common_cmd_max
cmpu b32 $r4 0x80
bra nc #dtable_cmd
cmpu b32 $r4 0x60
bra nc #dma_cmd
cmpu b32 $r4 0x50
bra ne #illegal_mthd
// mthd 0x140: PM_TRIGGER
mov $r2 0x2200
clear b32 $r3
sethi $r3 0x20000
iowr I[$r2] $r3
bra #cmddone
dma_cmd:
// mthd 0x180...: DMA_*
cmpu b32 $r4 0x60+#dma_count
bra nc #illegal_mthd
shl b32 $r5 $r4 2
add b32 $r5 (#ctx_dma - 0x60 * 4) & 0xffff
bset $r3 0x1e
st b32 D[$r5] $r3
add b32 $r4 0x180 - 0x60
shl b32 $r4 8
iowr I[$r4] $r3
bra #cmddone
dtable_cmd:
cmpu b32 $r4 $r6
bra nc #illegal_mthd
shl b32 $r4 3
add b32 $r4 $r5
ld b32 $r5 D[$r4 + 4]
and $r5 $r3
cmpu b32 $r5 0
bra ne #invalid_bitfield
ld b16 $r5 D[$r4]
ld b16 $r6 D[$r4 + 2]
cmpu b32 $r6 2
bra e #cmd_setctx
ld b32 $r7 D[$r0 + #ctx_cond_off]
and $r6 $r7
cmpu b32 $r6 1
bra e #cmddone
call $r5
bra $p1 #dispatch_error
bra #cmddone
cmd_setctx:
st b32 D[$r5] $r3
bra #cmddone
invalid_bitfield:
or $r2 1
dispatch_error:
illegal_mthd:
mov $r4 0x1000
iowr I[$r4] $r2
iowr I[$r4 + 0x100] $r3
mov $r4 0x40
iowr I[$r0] $r4
im_loop:
iord $r4 I[$r0 + 0x200]
and $r4 0x40
cmpu b32 $r4 0
bra ne #im_loop
cmddone:
// remove the command from FIFO
mov $r3 0x1d00
mov $r4 1
iowr I[$r3] $r4
nocmd:
// ack the processed interrupts
and $r1 $r1 0xc
iowr I[$r0 + 0x100] $r1
iret
cmd_query_get:
// if bit 0 of param set, trigger interrupt afterwards.
setp $p1 $r3
or $r2 3
// read PTIMER, beware of races...
mov $r4 0xb00
ptimer_retry:
iord $r6 I[$r4 + 0x100]
iord $r5 I[$r4]
iord $r7 I[$r4 + 0x100]
cmpu b32 $r6 $r7
bra ne #ptimer_retry
// prepare the query structure
ld b32 $r4 D[$r0 + #ctx_query_counter]
st b32 D[$r0 + #swap + 0x0] $r4
st b32 D[$r0 + #swap + 0x4] $r0
st b32 D[$r0 + #swap + 0x8] $r5
st b32 D[$r0 + #swap + 0xc] $r6
// will use target 0, DMA_QUERY.
mov $xtargets $r0
ld b32 $r4 D[$r0 + #ctx_query_address_high]
shl b32 $r4 0x18
mov $xdbase $r4
ld b32 $r4 D[$r0 + #ctx_query_address_low]
mov $r5 #swap
sethi $r5 0x20000
xdst $r4 $r5
xdwait
ret
cmd_cond_mode:
// if >= 5, INVALID_ENUM
bset $flags $p1
or $r2 2
cmpu b32 $r3 5
bra nc #return
// otherwise, no error.
bclr $flags $p1
// if < 2, no QUERY object is involved
cmpu b32 $r3 2
bra nc #cmd_cond_mode_queryful
xor $r3 1
st b32 D[$r0 + #ctx_cond_off] $r3
return:
ret
cmd_cond_mode_queryful:
// ok, will need to pull a QUERY object, prepare offsets
ld b32 $r4 D[$r0 + #ctx_cond_address_high]
ld b32 $r5 D[$r0 + #ctx_cond_address_low]
and $r6 $r5 0xff
shr b32 $r5 8
shl b32 $r4 0x18
or $r4 $r5
mov $xdbase $r4
mov $xtargets $r0
// pull the first one
mov $r5 #swap
sethi $r5 0x20000
xdld $r6 $r5
// if == 2, only a single QUERY is involved...
cmpu b32 $r3 2
bra ne #cmd_cond_mode_double
xdwait
ld b32 $r4 D[$r0 + #swap + 4]
cmpu b32 $r4 0
xbit $r4 $flags z
st b32 D[$r0 + #ctx_cond_off] $r4
ret
// ok, we'll need to pull second one too
cmd_cond_mode_double:
add b32 $r6 0x10
add b32 $r5 0x10
xdld $r6 $r5
xdwait
// compare COUNTERs
ld b32 $r5 D[$r0 + #swap + 0x00]
ld b32 $r6 D[$r0 + #swap + 0x10]
cmpu b32 $r5 $r6
xbit $r4 $flags z
// compare RESen
ld b32 $r5 D[$r0 + #swap + 0x04]
ld b32 $r6 D[$r0 + #swap + 0x14]
cmpu b32 $r5 $r6
xbit $r5 $flags z
and $r4 $r5
// and negate or not, depending on mode
cmpu b32 $r3 3
xbit $r5 $flags z
xor $r4 $r5
st b32 D[$r0 + #ctx_cond_off] $r4
ret
cmd_wrcache_flush:
bclr $flags $p1
mov $r2 0x2200
clear b32 $r3
sethi $r3 0x10000
iowr I[$r2] $r3
ret
crypt_cmd_mode:
// if >= 0xf, INVALID_ENUM
bset $flags $p1
or $r2 2
cmpu b32 $r3 0xf
bra nc #crypt_cmd_mode_return
bclr $flags $p1
st b32 D[$r0 + #ctx_mode] $r3
crypt_cmd_mode_return:
ret
crypt_cmd_length:
// nop if length == 0
cmpu b32 $r3 0
bra e #crypt_cmd_mode_return
// init key, IV
cxset 3
mov $r4 #ctx_key
sethi $r4 0x70000
xdst $r0 $r4
mov $r4 #ctx_iv
sethi $r4 0x60000
xdst $r0 $r4
xdwait
ckeyreg $c7
// prepare the targets
mov $r4 0x2100
mov $xtargets $r4
// prepare src address
ld b32 $r4 D[$r0 + #ctx_src_address_high]
ld b32 $r5 D[$r0 + #ctx_src_address_low]
shr b32 $r8 $r5 8
shl b32 $r4 0x18
or $r4 $r8
and $r5 $r5 0xff
// prepare dst address
ld b32 $r6 D[$r0 + #ctx_dst_address_high]
ld b32 $r7 D[$r0 + #ctx_dst_address_low]
shr b32 $r8 $r7 8
shl b32 $r6 0x18
or $r6 $r8
and $r7 $r7 0xff
// find the proper prep & do functions
ld b32 $r8 D[$r0 + #ctx_mode]
shl b32 $r8 2
// run prep
ld b16 $r9 D[$r8 + #crypt_dtable]
call $r9
// do it
ld b16 $r9 D[$r8 + #crypt_dtable + 2]
call $r9
cxset 1
xdwait
cxset 0x61
xdwait
xdwait
// update src address
shr b32 $r8 $r4 0x18
shl b32 $r9 $r4 8
add b32 $r9 $r5
adc b32 $r8 0
st b32 D[$r0 + #ctx_src_address_high] $r8
st b32 D[$r0 + #ctx_src_address_low] $r9
// update dst address
shr b32 $r8 $r6 0x18
shl b32 $r9 $r6 8
add b32 $r9 $r7
adc b32 $r8 0
st b32 D[$r0 + #ctx_dst_address_high] $r8
st b32 D[$r0 + #ctx_dst_address_low] $r9
// pull updated IV
cxset 2
mov $r4 #ctx_iv
sethi $r4 0x60000
xdld $r0 $r4
xdwait
ret
crypt_copy_prep:
cs0begin 2
cxsin $c0
cxsout $c0
ret
crypt_store_prep:
cs0begin 1
cxsout $c6
ret
crypt_ecb_e_prep:
cs0begin 3
cxsin $c0
cenc $c0 $c0
cxsout $c0
ret
crypt_ecb_d_prep:
ckexp $c7 $c7
cs0begin 3
cxsin $c0
cdec $c0 $c0
cxsout $c0
ret
crypt_cbc_e_prep:
cs0begin 4
cxsin $c0
cxor $c6 $c0
cenc $c6 $c6
cxsout $c6
ret
crypt_cbc_d_prep:
ckexp $c7 $c7
cs0begin 5
cmov $c2 $c6
cxsin $c6
cdec $c0 $c6
cxor $c0 $c2
cxsout $c0
ret
crypt_pcbc_e_prep:
cs0begin 5
cxsin $c0
cxor $c6 $c0
cenc $c6 $c6
cxsout $c6
cxor $c6 $c0
ret
crypt_pcbc_d_prep:
ckexp $c7 $c7
cs0begin 5
cxsin $c0
cdec $c1 $c0
cxor $c6 $c1
cxsout $c6
cxor $c6 $c0
ret
crypt_cfb_e_prep:
cs0begin 4
cenc $c6 $c6
cxsin $c0
cxor $c6 $c0
cxsout $c6
ret
crypt_cfb_d_prep:
cs0begin 4
cenc $c0 $c6
cxsin $c6
cxor $c0 $c6
cxsout $c0
ret
crypt_ofb_prep:
cs0begin 4
cenc $c6 $c6
cxsin $c0
cxor $c0 $c6
cxsout $c0
ret
crypt_ctr_prep:
cs0begin 5
cenc $c1 $c6
cadd $c6 1
cxsin $c0
cxor $c0 $c1
cxsout $c0
ret
crypt_cbc_mac_prep:
cs0begin 3
cxsin $c0
cxor $c6 $c0
cenc $c6 $c6
ret
crypt_cmac_finish_complete_prep:
cs0begin 7
cxsin $c0
cxor $c6 $c0
cxor $c0 $c0
cenc $c0 $c0
cprecmac $c0 $c0
cxor $c6 $c0
cenc $c6 $c6
ret
crypt_cmac_finish_partial_prep:
cs0begin 8
cxsin $c0
cxor $c6 $c0
cxor $c0 $c0
cenc $c0 $c0
cprecmac $c0 $c0
cprecmac $c0 $c0
cxor $c6 $c0
cenc $c6 $c6
ret
// TODO
crypt_do_in:
add b32 $r3 $r5
mov $xdbase $r4
mov $r9 #swap
sethi $r9 0x20000
crypt_do_in_loop:
xdld $r5 $r9
xdwait
cxset 0x22
xdst $r0 $r9
cs0exec 1
xdwait
add b32 $r5 0x10
cmpu b32 $r5 $r3
bra ne #crypt_do_in_loop
cxset 1
xdwait
ret
crypt_do_out:
add b32 $r3 $r7
mov $xdbase $r6
mov $r9 #swap
sethi $r9 0x20000
crypt_do_out_loop:
cs0exec 1
cxset 0x61
xdld $r7 $r9
xdst $r7 $r9
cxset 1
xdwait
add b32 $r7 0x10
cmpu b32 $r7 $r3
bra ne #crypt_do_out_loop
ret
crypt_do_inout:
add b32 $r3 $r5
mov $r9 #swap
sethi $r9 0x20000
crypt_do_inout_loop:
mov $xdbase $r4
xdld $r5 $r9
xdwait
cxset 0x21
xdst $r0 $r9
cs0exec 1
cxset 0x61
mov $xdbase $r6
xdld $r7 $r9
xdst $r7 $r9
cxset 1
xdwait
add b32 $r5 0x10
add b32 $r7 0x10
cmpu b32 $r5 $r3
bra ne #crypt_do_inout_loop
ret
.align 0x100