; $Id$:

; turn on/off individual refresh procedures
;     REFR_DIRECT_ADC                load ADC related configuration registers ADCEN, ADCINB, ADCDAC, ADCPAR, ADCTST, ADCDAC could be set individually?
;     REFR_DIRECT_ALL                IRQHW0..3, IRQHL0..3, CPUnCLK (0..3), NITM0, NIP4D, NICLK, FILCLK, PRECLK, NMOD, NCUT
;     REFR_DIRECT_HCM                NIODE, NIOCE, NIIDE (for HCM), ADCEN & ADCMSK = 0
;     REFR_DIRECT_BM_M               NIODE, NIOCE, NIIDE (for non-HCM)
;     REFR_IRQ_VEC                   - CLR, ACQ, RAW start addresses
;     REFR_IRQ_VEC_HCM_BM            /
;     REFR_PRE_PAR                   TPPT0, TPFS, TPPGR, TPPAE, TPQS0, TPQE0, TPQS1, TPQE1, EBD, EBAQA, EBSIA, EBSF, EBSIM, EBIS, EBIT, EBIL, EBIN
;     REFR_PRE_CLUST_PAR             TPFP, TPHT, (TPVT), TPVBY, TPCT, TPCL, TPCBY, TPD
;     REFR_LPTC_FIL                  FLBY, FPBY, FGBY, FTBY, FCBY, FCW0..4, FTAL, FTLL, FTLS, FPTC, FPNP, FPCL
;     REFR_GAIN_FILG                 FGTA, FGTB, FGCL, FGF0..20
;     REFR_GAIN_FILA                 FGA0..20
;     REFR_TPL                       TPL LUT
;     REFR_PATCHM_PAR                SMMODE, NTRO, NRRO, NED, NDLY, NP0..3 stored in DB
;     REFR_UNUSED

; Refresh procedures, short descriptions
;
; MCMs with ADCs (all without BM & HCM)
; load_direct_first_m       (HI)            35      01234567        DMDELA, DMDELS, ARBTIM, MEMCOR, SEBDEN, SEBDOU, SML0..2
; load_patchm_par           (HI)            71+     ----4---        SMMODE, NTRO, NRRO, NED, NDLY, NP0..3 stored in DB
; load_direct_all           (HI)            49      --2----7        IRQHW0..3, IRQHL0..3, CPUnCLK (0..3), NITM0, NIP4D, NICLK, FILCLK, PRECLK, NMOD, NCUT
; load_irq_vec              (HI)            11                      CLR, ACQ, RAW start addresses
;                                            CPU0   --2-3--7
;                                            CPU1   --2--5-7
;                                            CPU2   -1---5-7
;                                            CPU3   01234567

; load_dm_par               (HI)            22      01---5--        SCALE_Y_DM, SCALE_D_DM, SCALE_Q_DM, DEFL_CR_DM
; load_pre_par              (HI)            57      --2-4---        TPPT0, TPFS, TPPGR, TPPAE, TPQS0, TPQE0, TPQS1, TPQE1, EBD, EBAQA, EBSIA, EBSF, EBSIM, EBIS, EBIT, EBIL, EBIN
; load_direct_bm_m          (HI)            11      -1---56-        NIODE, NIOCE, NIIDE (for non-HCM)
; load_pre_clust_par        (LOW)           28      ----4--         TPFP, TPHT, (TPVT), TPVBY, TPCT, TPCL, TPCBY, TPD
; load_direct_adc           (LOW)           20                      ADCEN, ADCINB, ADCDAC, ADCPAR, ADCTST
; load_tpl                  (LOW)          156                      TPL LUT  1/3a
; load_tpl_sec_cpu          (LOW)           77                      TPL_LUT  1/3b
; load_lptc_fil             (LOW)           65                      FLBY, FPBY, FGBY, FTBY, FCBY, FCW0..4, FTAL, FTLL, FTLS, FPTC, FPNP, FPCL
; load_gain_filg            (LOW)          103                      FGTA, FGTB, FGCL, FGF0..20
; load_gain_fila            (LOW)           93                      FGA0..20

; The remaining MCMs (BM & HCM)
; load_direct_first_hcm_bm  (HI)            35                      ARBTIM, MEMCOR, SEBDEN, SEBDOU, SML0..2
; load_patchm_par           (HI)            71+                     SMMODE, NTRO, NRRO, NED, NDLY, NP0..3 stored in DB
; load_direct_all           (HI)            49                      IRQHW0..3, IRQHL0..3, CPUnCLK (0..3), NITM0, NIP4D, NICLK, FILCLK, PRECLK, NMOD, NCUT
; load_irq_vec_hcm_bm       (HI)            11                      CLR, ACQ, RAW start addresses
; load_direct_bm_m  \       (HI)            10                      NIODE, NIOCE, NIIDE (for non-HCM)
; load_direct_hcm   /       (HI)            16                      NIODE, NIOCE, NIIDE (for HCM), ADCEN & ADCMSK = 0
;?? load_pre_par              (HI)            57                      TPPT0, TPFS, TPPGR, TPPAE, TPQS0, TPQE0, TPQS1, TPQE1, EBD, EBAQA, EBSIA, EBSF, EBSIM, EBIS, EBIT, EBIL, EBIN


; more info about config refresh procedures
;
; all of them use r14 as IO pointer, r15 as DMEM pointer (only when needed), r1 for data and r8 (rstack) as return address
; - the TPL refresh needs more registers (r2..r4, g14, g15)
;
; ************************** ALL MCMs ***************************************************
;
; load_patchm_par  (HI)
; refresh the patch parameters: SMMODE, NTRO, NRRO, NED, NDLY, NP0..3 stored in DB in a block of 6 words
; Length: 71 instructions

; load_direct_all  (LOW)
; load several parameters related to all MCMs
; IRQHW0..3, IRQHL0..3, CPUnCLK (0..3), NITM0, NIP4D, NICLK, FILCLK, PRECLK, NMOD, NCUT
; Length: 49 instructions (but definitely more CPU clocks!)


; ************************** NO BM, HCM ***************************************************
;

; load_direct_first_m (HI)
; load very important parameters, preferably before the others
; DMDELA, DMDELS, ARBTIM, MEMCOR, SEBDEN, SEBDOU, SML0..2
; Length: 35 instructions

; load_irq_vec        (HI)
; load the interrupt vectors for the 3 used interrupts (CLR, ACQ, RAW)
; - this must be done by ALL CPUs!
; - the interrupt vectors are independent and if omitting the ORGs, the start
;   addresses for each CPU could be different!
; Length: 11 instructions

; load_dm_par         (HI)
; load 4 tracklet parameters from DMEM needed for the tracklet program
; (ADCMSK_DM), SCALE_Y_DM, SCALE_D_DM, SCALE_Q_DM, DEFL_CR_DM
; - ADCMSK will be refreshed in the normal tracklet program at each run with tracklets!
; Length: 22 instructions

; load_pre_par        (HI)
; preprocessor parameters:
; TPPT0, TPFS, TPPGR, TPPAE, TPQS0, TPQE0, TPQS1, TPQE1,
; EBD, EBAQA, EBSIA, EBSF, EBSIM, EBIS, EBIT, EBIL, EBIN
; Length: 57 instructions

; load_pre_clust_par  (LOW)
; preprocessor
; TPFP, TPHT, (TPVT), TPVBY, TPCT, TPCL, TPCBY, TPD
; Length: 25 instructions (when TPVBY=1), else +3


; load_direct_adc     (LOW)
; load ADC related configuration registers ADCEN, ADCINB, ADCDAC, ADCPAR, ADCTST
; ADCDAC could be set individually?
; Length: 20 instructions

; load_tpl            (LOW)
; Length: 64 instructions, but 64+23*4=156 CPU clocks at least
; load_tpl_sec_cpu
; Length 8+23, but 8+23*3=77 CPU clocks at least (but this CPU has lower priority on the GIO bus!)
;
; executed by 2 CPUs, the first prepares the start addresses and updates the refresh counter and refresh phase
; the second just refreshes the second part of the refreshed 1/3.
; this takes long, use it in tracklet mode only when all CPUs don't have anything to do!

; load_direct_bm_m    (LOW)
; load some parameters to all without HCM
; Length: 11 instructions

; load_lptc_fil       (LOW)
; - write all bypass flags at once
; - disable the linearity filter
; - configure the pedestal filter
; - disable the cross talk filter properly
; - configure the tail filter
; FLBY, FPBY, FGBY, FTBY, FCBY, FCW0..4, FTAL, FTLL, FTLS, FPTC, FPNP, FPCL
; Length: 45 instructions but at least 45+20 CPU clocks


; load_gain_filg      (LOW)
; - read the gain factors from DMEM (3 x 9 bit in one 32-bit word) x 7 for 21 channels and store them to the registers in FGFn+ch
; FGTA, FGTB, FGCL, FGF0..20
; long (3+21 GIO writes)
; Length: 25 instructions but at least 25+78=103 CPU clocks

; load_gain_fila      (LOW)
; - read the gain additives from DMEM (5 x 6 bit in one 32-bit word) x 4 + 1 and store them to FGAn+ch
; FGA0..20
; long (21 GIO writes)
; Length: 30 instructions but at least 30+21*3=93 CPU clocks


; ***************************** BM and HCM ************************************************
;

; load_direct_first_hcm_bm  (HI)
; load very important parameters, preferably before the others
; ARBTIM, MEMCOR, SEBDEN, SEBDOU, SML0..2
; Length: 27 instructions

; load_irq_vec_hcm_bm       (HI)
; load the interrupt vectors for the 3 used interrupts (CLR, ACQ, RAW)
; Length: 11 instructions

; load_direct_hcm           (LOW)
; load parameters specific to HCM only
; NIODE, NIOCE, NIIDE (for HCM), ADCEN & ADCMSK = 0
; Length: 16 instructions

; === BM only ====
; load_direct_bm_m          (LOW)
; load parameters to all without HCM
; NIODE, NIOCE, NIIDE (for non-HCM)
; Length: 10 instructions

;#ORG-

; load_direct_first_m / load_direct_first_hcm_bm
; Load the most important parameters, preferably before all other refresh routines.
;   load_direct_first_m      : writes DMDELA and DMDELS, then falls through into
;                              load_direct_first_hcm_bm
;   load_direct_first_hcm_bm : writes ARBTIM, MEMCOR, SEBDEN, SEBDOU, SML0, SML1 and, depending
;                              on DYN_L1A, SML2 (DYN_L1A = 0) or CTGCTRL (DYN_L1A = 1)
; Modifies: r1. Returns through rstack.
; Every write follows the same pattern: value -> r1, spin on cc_busy (presumably until the
; global I/O bus is free - TODO confirm), then sgio to the target register. Some targets are
; preceded by iext (presumably because their address needs the extension word).
; Full Length: 35 instructions
load_direct_first_m:
    ; DMEM timing
    mov DMDELA_VAL, r1
    jmpr cc_busy, 0
    iext DMDELA
    sgio r1, DMDELA

    mov DMDELS_VAL, r1
    jmpr cc_busy, 0
    iext DMDELS
    sgio r1, DMDELS

; Second entry point: HCM and BM skip the DMEM parameters, as DMEM is not used there.
; Started from here: Length: 27 instructions
load_direct_first_hcm_bm:
    ; global IO bus timing
    mov ARBTIM_VAL, r1
    jmpr cc_busy, 0
    sgio r1, ARBTIM

    ; hamming correction
    mov MEMCOR_VAL, r1
    jmpr cc_busy, 0
    iext MEMCOR
    sgio r1, MEMCOR

;   SEBDEN, SEBDOU - a change here could reset parts of ROBs immediately! (CM, BM, HCM)
;                    but a single bit flip in only one of them is not dangerous, therefore
;                    a regular refresh could help a lot! Values written: SEBDEN=0, SEBDOU=b111
    mov 0, r1
    jmpr cc_busy, 0
    iext SEBDEN
    sgio r1, SEBDEN

    mov b111, r1
    jmpr cc_busy, 0
    iext SEBDOU
    sgio r1, SEBDOU

    ; state machine registers SML0 and SML1 are always refreshed here
    mova SML0_VAL, r1
    jmpr cc_busy, 0
    sgio r1, SML0

    mova SML1_VAL, r1
    jmpr cc_busy, 0
    sgio r1, SML1

    #ifeq DYN_L1A, 0            ; only in this case refresh the SML2 here, otherwise it will be done in the tracklet program directly
    mova SML2_VAL, r1           ; the value of SML2, with or without Ignore flag set
    jmpr cc_busy, 0
    sgio r1, SML2
    #endif

    #ifeq DYN_L1A, 1            ; only in this case refresh the counter config, otherwise it is not so important
    mova CTGCTRL_VAL, r1
    jmpr cc_busy, 0
    sgio r1, CTGCTRL
    #endif

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack


ORG 0xC40
; load_dm_par
; Read consecutive 32-bit words from DMEM, starting at SCALE_Y_DM, copy each one into a
; tracklet parameter register and write it straight back to DMEM.
; The original description lists (ADCMSK_DM), SCALE_Y_DM, SCALE_D_DM, SCALE_Q_DM, DEFL_CR_DM;
; the code below actually fills scale_y, offs_y, scale_d and (only when SCALE_Q_DM is
; defined) scale_q, in that order.
; NOTE(review): the description and the register names disagree - confirm the DMEM layout.
; - ADCMSK will be refreshed in the normal tracklet program at each run with tracklets!
; Modifies: r1, r15. Returns through rstack.
; Length: 22 instructions

load_dm_par:        ; probably no longer necessary, as this will be done in the tracklet program

; Earlier variant, kept for reference (register copy before the write-back, nop before each read):
;   mov  SCALE_Y_DM, r15                ; prepare for reading from DMEM
;   nop
;   lra  rr_dword, r1                   ; read scale_y
;   lra  rr_dword, r1
;   mov  r1, scale_y
;   sra+ r1
;
;   nop
;   lra  rr_dword, r1                   ; read scale_d
;   lra  rr_dword, r1
;   mov  r1, scale_d
;   sra+ r1
;
;   nop
;   lra  rr_dword, r1                   ; read defl_cor
;   lra  rr_dword, r1
;   mov  r1, r1                         ; later defl_cor, if necessary, now dummy
;   sra+ r1
;
;   nop
;   lra  rr_dword, r1                   ; read scale factor for Q
;   lra  rr_dword, r1
;   mov  r1, scale_q
;   sra+ r1
    ; Variant in use: shorter, write-back (sra+) directly after the read.
    ; Open question from the author: is sra directly after lra ok?
    ; Each read is issued twice (presumably the first lra starts the DMEM access and the
    ; second delivers the data - TODO confirm).
    mov  SCALE_Y_DM, r15                ; r15 = DMEM pointer to the first parameter word
    nop
    lra  rr_dword, r1                   ; first word
    lra  rr_dword, r1
    sra+ r1                             ; write back; r15 presumably advances to the next word
    mov  r1, scale_y                    ; first word -> scale_y

    lra  rr_dword, r1                   ; second word
    lra  rr_dword, r1
    sra+ r1
    mov  r1, offs_y                     ; second word -> offs_y

    lra  rr_dword, r1                   ; third word
    lra  rr_dword, r1
    sra+ r1
    mov  r1, scale_d                         ; third word -> scale_d

    #ifdef SCALE_Q_DM
    lra  rr_dword, r1                   ; fourth word: scale factor for Q
    lra  rr_dword, r1
    mov  r1, scale_q
    sra+ r1
    #else
    add r15, 4, r15                     ; no Q scale configured: only step r15 over the word
    #endif

    ; ??? about 50 cpu clocks to here

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xC60

; load_direct_adc
; Load the ADC related configuration registers ADCEN, ADCINB, ADCDAC, ADCPAR, ADCTST.
; ADCEN is written by name; the other four are written through the r14 I/O pointer with
; sgio+ (the address advances after each write).
; Open question from the author: could ADCDAC be set individually?
; Enabled by REFR_DIRECT_ADC; otherwise only a delay loop is executed.
; Modifies: r1 and, when enabled, r14. Returns through rstack.
; Length: 20 instructions
load_direct_adc:

#ifdef REFR_DIRECT_ADC
    mov ADCEN_VAL, r1
    jmpr cc_busy, 0
    sgio r1, ADCEN
    ; ADCMSK is at 0x3050 and will be refreshed from DMEM each tracklet run, so it is omitted here!
    ; ADCINB, ADCDAC, ADCPAR, ADCTST are 0x3051..4
    iext ADCINB
    mov ADCINB, r14         ; r14 = I/O pointer to ADCINB
    mov ADCINB_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; ADCINB

    mov ADCDAC_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; ADCDAC

    iext ADCPAR_VAL
    mov ADCPAR_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; ADCPAR

    mov 0, r1               ; ADCTST, must remain 00, 01 +VREF, 10 -VREF, 11 - 0
    jmpr cc_busy, 0
    sgio+ r1                ; ADCTST
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 7, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif


    ; one more spin on cc_busy before returning (executed in both variants)
    jmpr cc_busy, 0

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xC80

; load_direct_all
; Load several parameters that are common to all MCMs:
; IRQHW0..3, IRQHL0..3, CPUnCLK (0..3), NITM0, NIP4D, NICLK, FILCLK, PRECLK, NMOD, NCUT
; Each group of four registers gets one common value, so r1 is loaded once per group.
; Enabled by REFR_DIRECT_ALL; otherwise only a delay loop is executed.
; Modifies: r1; r14 is used as scratch when NCUT_SINGLE_P_VAL is not 0xFF.
; Returns through rstack.
; Length: 49 instructions (but definitely more CPU clocks!)
load_direct_all:

#ifdef REFR_DIRECT_ALL
    ; interrupt masks, the same value IRQHW_VAL for all four registers
    mov IRQHW_VAL, r1
    jmpr cc_busy, 0
    sgio r1, IRQHW0
    jmpr cc_busy, 0
    sgio r1, IRQHW1
    jmpr cc_busy, 0
    sgio r1, IRQHW2
    jmpr cc_busy, 0
    sgio r1, IRQHW3

    ; the same value IRQHL_VAL for all four registers
    mov IRQHL_VAL, r1
    jmpr cc_busy, 0
    sgio r1, IRQHL0
    jmpr cc_busy, 0
    sgio r1, IRQHL1
    jmpr cc_busy, 0
    sgio r1, IRQHL2
    jmpr cc_busy, 0
    sgio r1, IRQHL3

    ; the same clock setting CPUxCLK_VAL for all four CPUs
    mov CPUxCLK_VAL, r1
    jmpr cc_busy, 0
    sgio r1, CPU0CLK
    jmpr cc_busy, 0
    sgio r1, CPU1CLK
    jmpr cc_busy, 0
    sgio r1, CPU2CLK
    jmpr cc_busy, 0
    sgio r1, CPU3CLK

    mova NITM0_VAL, r1    ; can be up to 14-bit, but is much smaller, = 12*nsamples+50
    jmpr cc_busy, 0
    sgio r1, NITM0
    ; about 90 cpu clocks to here

    mov NIP4D_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIP4D

    mov NICLK_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NICLK

    mov FILCLK_VAL, r1
    jmpr cc_busy, 0
    sgio r1, FILCLK

    mov PRECLK_VAL, r1
    jmpr cc_busy, 0
    sgio r1, PRECLK

    mov NMOD_RST_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NMOD

    ; NCUT holds the same single-port byte four times
    #ifeq NCUT_SINGLE_P_VAL, 0xFF
    mov c7, r1                      ; presumably c7 is a constant register holding all ones - TODO confirm
    #else                           ; combine 4 identical bytes
    mov NCUT_SINGLE_P_VAL, r1
    sll 8, r1, r14                  ; r14 = byte << 8
    or r1, r14, r1                  ; r1 = byte repeated in the two low bytes
    swp r1, r14                     ; presumably r14 = r1 with its halves swapped - TODO confirm
    or r1, r14, r1                  ; r1 = byte repeated in all four bytes
    #endif
    jmpr cc_busy, 0
    sgio r1, NCUT
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 20, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xCE0

; load_direct_hcm
; Load the parameters specific to the HCM only:
; NIODE, NIOCE, NIIDE (HCM values), then ADCEN = 0 and ADCMSK = 0.
; Enabled by REFR_DIRECT_HCM; otherwise only a delay loop is executed.
; Modifies: r1. Returns through rstack.
; Length: 16 instructions
load_direct_hcm:
#ifdef REFR_DIRECT_HCM
    mov NIODE_HCM_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIODE

    mov NIOCE_HCM_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIOCE

    mov NIIDE_HCM_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIIDE

    ; r1 = 0 is used for both ADCEN and ADCMSK
    mov 0, r1
    jmpr cc_busy, 0
    sgio r1, ADCEN

    jmpr cc_busy, 0
    iext ADCMSK
    sgio r1, ADCMSK
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 6, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xD00
; load_direct_bm_m
; Counterpart of load_direct_hcm for BM and normal MCMs (everything except the HCM):
; writes NIODE, NIOCE, NIIDE with the non-HCM values.
; Enabled by REFR_DIRECT_BM_M; otherwise only a delay loop is executed.
; Modifies: r1. Returns through rstack.
; Length: 10 instructions
load_direct_bm_m:
; for all MCMs except for the HCM
#ifdef REFR_DIRECT_BM_M
    mov NIODE_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIODE

    mov NIOCE_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIOCE

    mov NIIDE_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIIDE
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 4, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xD20
; load_irq_vec / init_irq_vec
; Load the interrupt vectors for the 3 used interrupts (CLR, ACQ, RAW):
; IA_CLR = clr, IA_RAW = raw, IA_ACQ = acq.
; - this must be done by ALL CPUs!
; - the interrupt vectors are independent and if omitting the ORGs, the start
;   addresses for each CPU could be different!
; Two entry points:
;   load_irq_vec : refresh entry. With REFR_IRQ_VEC defined it executes one nop and falls
;                  through into init_irq_vec; otherwise it runs a delay loop and returns
;                  without touching the vectors.
;   init_irq_vec : always assembled; writes the three vectors unconditionally.
; Modifies: r1. Returns through rstack.
; Length: 11 instructions
load_irq_vec:
#ifdef REFR_IRQ_VEC
    nop
#else
    ; Refresh compiled out: count r1 down in a short delay loop, then return.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 4, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
    jmp cc_uncond, rstack
#endif

init_irq_vec:
    ; modifies r1, uses rstack (r8) as return address
    mov clr, r1
    jmpr cc_busy, 0
    sgio r1, IA_CLR;

    mov raw, r1
    jmpr cc_busy, 0
    sgio r1, IA_RAW;

    mov acq, r1
    jmpr cc_busy, 0
    sgio r1, IA_ACQ;

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xD40
; load_irq_vec_hcm_bm / init_irq_vec_hcm_bm
; Load the interrupt vectors for the 3 used interrupts (CLR, ACQ, RAW) on HCM and BM:
; IA_CLR = clr, IA_RAW = raw, IA_ACQ = acq_hcm_bm (only the ACQ handler differs from the
; one written by init_irq_vec).
; Two entry points:
;   load_irq_vec_hcm_bm : refresh entry. With REFR_IRQ_VEC defined it executes one nop and
;                         falls through; otherwise it runs a delay loop and returns.
;   init_irq_vec_hcm_bm : always assembled; writes the three vectors unconditionally.
; NOTE(review): the switch list at the top of the file names REFR_IRQ_VEC_HCM_BM, but this
; routine is gated by REFR_IRQ_VEC - confirm which switch is intended.
; Modifies: r1. Returns through rstack.
; Length: 11 instructions
load_irq_vec_hcm_bm:
#ifdef REFR_IRQ_VEC
    nop
#else
    ; Refresh compiled out: count r1 down in a short delay loop, then return.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 4, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
    jmp cc_uncond, rstack
#endif
init_irq_vec_hcm_bm:
    ; modifies r1, uses rstack (r8) as return address
    mov clr, r1
    jmpr cc_busy, 0
    sgio r1, IA_CLR;

    mov raw, r1
    jmpr cc_busy, 0
    sgio r1, IA_RAW;

    mov acq_hcm_bm, r1
    jmpr cc_busy, 0
    sgio r1, IA_ACQ;

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xD60

; load_pre_par
; Refresh the preprocessor parameters:
; TPPT0, TPFS, TPPGR, TPPAE, TPQS0, TPQE0, TPQS1, TPQE1,
; EBD, EBAQA, EBSIA, EBSF, EBSIM, EBIS, EBIT, EBIL, EBIN
; The registers are written in address order through the I/O pointer rio with sgio+
; (rio appears to be the r14 I/O pointer - TODO confirm). The pointer is set twice:
; once to TPPT0 and, after a gap in the address map, once to EBIS.
; Enabled by REFR_PRE_PAR; otherwise only a delay loop is executed.
; Modifies: r1 and, when enabled, rio. Returns through rstack.
; Length: 57 instructions
load_pre_par:
#ifdef REFR_PRE_PAR
    iext TPPT0                  ; use that TPPT0, TPFS, TPFE, TPPGR, TPPAE, TPQS0, TPQE0, TPQS1, TPQE1
    mov  TPPT0, rio             ; are on incremental addresses
    mov TPPT0_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPPT0

    mov TPFS_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPFS

    add rio, c1, rio            ; skip the next address, this is TPFE and will be refreshed in the fit program
    mov TPPGR_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPPGR

    mov TPPAE_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPPAE

    mov TPQS0_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPQS0

    mov TPQE0_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPQE0

    mov TPQS1_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPQS1

    mov TPQE1_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPQE1

    ; the event buffer parameters continue on the following addresses
    mov EBD_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBD

    mov EBAQA_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBAQA

    mov EBSIA_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBSIA

    mov EBSF_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBSF

    mov EBSIM_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBSIM

    ; skip some addresses: reload the pointer for the second run EBIS, EBIT, EBIL, EBIN

    iext EBIS
    mov EBIS, rio
    mov EBIS_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBIS

    mov EBIT_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBIT

    mov EBIL_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBIL

    mov EBIN_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; EBIN
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 21, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

; more preprocessor parameters exist! (they are not refreshed by this routine)

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xDA0

; load_pre_clust_par
; Refresh the preprocessor cluster parameters:
; TPFP, TPHT, (TPVT), TPVBY, TPCT, TPCL, TPCBY, TPD
; Written in address order through rio with sgio+, starting at TPFP. TPVT is written only
; when TPVBY_VAL is 1; otherwise its address is just skipped.
; Enabled by REFR_PRE_CLUST_PAR; otherwise only a delay loop is executed.
; Modifies: r1 and, when enabled, rio. Returns through rstack.
; Length: 25 instructions (when TPVBY=1), else +3
; NOTE(review): in the code the TPVBY_VAL = 1 variant is the longer one (3 instructions
; instead of 1) - confirm the length statement above.
load_pre_clust_par:
#ifdef REFR_PRE_CLUST_PAR
    iext TPFP
    mov TPFP, rio               ; rio = I/O pointer to TPFP

    mov TPFP_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPFP

    mova TPHT_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPHT

    #ifeq TPVBY_VAL, 1          ; not bypassed
    mov TPVT_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPVT
    #else
    add rio, c1, rio            ; TPVT not refreshed, only step over its address
    #endif

    mov TPVBY_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPVBY

    mov TPCT_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPCT

    mov TPCL_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPCL

    mov TPCBY_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPCBY

    mov TPD_RST_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                    ; TPD
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 11, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xDC0
load_lptc_fil:
; Refresh the configuration of the linearity, pedestal, tail and cross talk filters:
; - write all bypass flags at once
; - disable the linearity filter
; - configure the pedestal filter
; - disable the cross talk filter properly
; - configure the tail filter
; FLBY, FPBY, FGBY, FTBY, FCBY, FCW0..4, FTAL, FTLL, FTLS, FPTC, FPNP, FPCL
; All writes go through the I/O pointer rio with sgio+ (rio appears to be the r14 I/O
; pointer - TODO confirm); rio is reloaded at the start of every register group.
; Enabled by REFR_LPTC_FIL; otherwise only a delay loop is executed.
; Modifies: r1 and, when enabled, r2 and rio. Returns through rstack.
; Length: 45 instructions but at least 45+20 CPU clocks

    ; first the bypass registers, 1 bit long each. The order is FL, FP, FG, FT, FC
#ifdef REFR_LPTC_FIL
    iext FLBY               ; init the address pointer
    mov FLBY, rio
                            ; now write the 5 bypass registers
    mov FLBY_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1

    mov FPBY_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1

    mov FGBY_VAL, r1        ; gain bypass refreshed here, but the parameters in a separate procedure
    jmpr cc_busy, 0
    sgio+ r1

    mov FTBY_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1

    mov FCBY_VAL, r1        ; disabled, but not bypassed, as used for 2 pipeline stages
    jmpr cc_busy, 0
    sgio+ r1

    ; cross talk filter: it is not actually used, so only the all-zero case is covered here
    iext FCWn
    mov FCWn, rio       ; the coefficients from 0 to 4, all are 0
    mov 0, r1
    mov 4, r2           ; used as counter

    ; Write loop for the coefficients. The jump goes 3 instructions back, to the busy wait,
    ; and is taken while the decrement of r2 produces no carry (presumably 5 passes for
    ; FCW0..4 - TODO confirm the carry semantics of sub).
    jmpr cc_busy, 0
    sgio+ r1
    sub r2, c1, r2
    jmpr -3, cc_ncarry

    ; now the tail cancellation filter
    #ifeq FTBY_VAL, 1       ; only when enabled refresh the other 3 parameters

    iext FTAL
    mov FTAL, rio

    mov FTAL_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; FTAL

    mov FTLL_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; FTLL

    mov FTLS_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; FTLS

    #endif

    ; now the pedestal filter
    #ifeq FPBY_VAL, 1       ; only when enabled refresh the other 2 parameters

    iext FPTC
    mov FPTC, rio

    mov FPTC_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; FPTC

    mov FPNP_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                ; FPNP

    mov 1, r1               ; FPCL is clear for the pedestal filter, it must stay at 1
    jmpr cc_busy, 0
    sgio+ r1                ; FPCL

    #endif
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 23, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xE00
; load_gain_filg
; Refresh the gain filter thresholds and factors: FGTA, FGTB, FGCL, FGF0..20
; - FGTA, FGTB and FGCL (always 1) are written first
; - then the gain factors are read from DMEM (3 x 9 bit in one 32-bit word) x 7 for the
;   21 channels and stored to the registers in FGFn+ch
; long (3+21 GIO writes)
; The whole body is assembled only when FGBY_VAL is 1; inside that, REFR_GAIN_FILG selects
; between the refresh and a delay loop. With FGBY_VAL != 1 the routine only returns.
; Returns through rstack.
; Length: 25 instructions but at least 25+78=103 CPU clocks
load_gain_filg:
; modifies r1, r2, r14, r15
    #ifeq FGBY_VAL, 1       ; only when the gain filter is enabled refresh its parameters
    #ifdef REFR_GAIN_FILG
    iext FGTA
    mov FGTA, rio

    mov FGTA_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; FGTA

    mov FGTB_VAL, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; FGTB

    mov 1, r1                       ; FGCL is a clear and must be 1
    jmpr cc_busy, 0
    sgio+ r1                        ; FGCL

    mov GAIN_TABLE_MULT_DM, r15     ; DMEM pointer to the beginning of the table
    mov 7, r2                       ; loop counter, we need to unpack 7 32-bit words
    iext FGFn
    mov FGFn, rio                   ; start address of the gain table factors

; one pass per packed DMEM word: three GIO writes, shifting r1 right by 9 bits in between
_load_gain_filg_lp:
    lra4 r1
    lra4+ r1                        ; read the word; presumably r15 advances to the next one

    jmpr cc_busy, 0
    sgio+ r1                        ; bits 8..0
    slr 9, r1, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; bits 17..9
    slr 9, r1, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; bits 26..18
    sub r2, c1, r2
    jmp cc_nzero, _load_gain_filg_lp
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 35, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    #endif
    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack

ORG 0xE20
; load_gain_fila
; Refresh the gain filter additives FGA0..20 (21 GIO writes):
; - read the gain additives from DMEM (5 x 6 bit in one 32-bit word) x 4 + 1 and store
;   them to FGAn+ch
; The whole body is assembled only when FGBY_VAL is 1; inside that, REFR_GAIN_FILA selects
; between the refresh and a delay loop. With FGBY_VAL != 1 the routine only returns.
; Returns through rstack.
; Length: 30 instructions but at least 30+21*3=93 CPU clocks
load_gain_fila:
; modifies r1, r2, r14, r15
    #ifeq FGBY_VAL, 1       ; only when the gain filter is enabled refresh the additives
    #ifdef REFR_GAIN_FILA

    mov GAIN_TABLE_ADDT_DM, r15     ; DMEM pointer to the beginning of the table
    mov 4, r2                       ; loop counter, we need to unpack 5 32-bit words
    iext FGAn                       ; but in the loop only 4 words
    mov FGAn, rio                   ; start address of the gain table additives

; one pass per packed DMEM word: five GIO writes, shifting r1 right by 6 bits in between
_load_gain_fila_lp:
    lra4  r1
    lra4+ r1                        ; read the word; presumably r15 advances to the next one

    jmpr cc_busy, 0
    sgio+ r1                        ; bits 5..0

    slr 6, r1, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; bits 11..06

    slr 6, r1, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; bits 17..12

    slr 6, r1, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; bits 23..18

    slr 6, r1, r1
    jmpr cc_busy, 0
    sgio+ r1                        ; bits 29..24

    sub r2, c1, r2
    jmp cc_nzero, _load_gain_fila_lp
    ; up to now 4 32-bit words with 5 x 6-bit values -> 20 channels
    ; here the last channel: only the lowest field of the fifth word is written
    lra4  r1
    lra4+ r1

    jmpr cc_busy, 0
    sgio+ r1
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 33, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    #endif
    ; ready, return to the address held in rstack
    jmp cc_uncond, rstack


ORG 0xE40
; load_tpl
; Refresh one third of the TPL look-up table in GIO from its packed copy in DMEM.
; uses r14, r15 as address pointers, r1..4 for data, r8 (rstack) as return address
; additionally clobbers r7 (DMEM increment) and writes g14, g15 for the second CPU
; this function must be executed 3 times to refresh the complete LUT.
; Length: 64 instructions, but 64+23*4=156 CPU clocks at least

; executed by 2 CPUs, the first prepares the start addresses and updates the refresh counter and refresh phase
; the second just refreshes the second part of the refreshed 1/3 (see load_tpl_sec_cpu,
; which jumps into _load_tpl_loop below and therefore returns through the same exit).

; this takes long, use it in tracklet mode only when all CPUs don't have anything to do!

; here follows the code started in the first CPU, with higher priority on the GIO bus
load_tpl:
    ; initialise the loop with the addresses in DMEM (source) and GIO (target)
    ; the DMEM will be read in 32-bit words, each word has 6 x 5 bit LUT data
    ; (the loop below shifts right by 5 bits between consecutive GIO writes):
    ; LUT32[ 0]=LUT5[0] | LUT5[1] << 5 | LUT5[2] << 10 | LUT5[3] << 15 | LUT5[4] << 20 | LUT5[5] << 25
    ; ...
    ; LUT32[21]=LUT5[126] | LUT5[127] << 5 | flags << 12 | counter << 16
    ; where flags is a 2-bit counter, 0 - init state, 1, 2, 3 - first, second, third/3 refreshed
    ; a full word gives 6 GIO writes (21 words); the last word gives only 2! 128 = 6*21 + 2
#ifdef REFR_TPL

    ; --- step 1: the last word: LUT[126], LUT[127], phase flags and refresh counter ---
    mov POS_LUT_TBL_DM_E, r15           ; the last address in DMEM
    iext TPLm2
    mov TPLm2, r14                  ; the address in GIO

    lra  rr_dword, r1
    lra  rr_dword, r1               ; read the last word

    jmpr cc_busy, 0
    sgio+ r1                        ; refresh the before last LUT[126]
    ; the bookkeeping below runs while the GIO write is in progress
    slr 16, r1, r3                  ; the refresh counter
    slr 12, r1, r2                  ; get the flags
    and r2, c3, r2                  ; 0 - init state, 1, 2, 3 - first, second, third/3 refreshed
    add r2, c1, r2                  ; inc the state
    slr 3, r2                       ; mov bit 2 to carry (two-operand form: r2 itself is still used below)
    adc r2, c0, r2                  ; if the result was 100 => 101, else no change
    and r2, c3, r2                  ; clear bit 2 if set, here is the new flag: 1, 2 or 3
    slr 5, r1, r4                   ; prepare the next 5 bits for writing in r4
    jmpr cc_busy, 0
    sgio+ r4                        ; write the last entry in the LUT

    mov 0x3FF, r4                   ; 10 bit mask
    and r1, r4, r1                  ; the lower 10 bits, the last two LUT entries

    add r3, c1, r3                  ; inc the refresh counter in r3
    sll 4, r3, r3                   ; cnt << 4
    or r3, r2, r3                   ; flags are in r2, (cnt << 4) | flags
    sll 12, r3, r3                  ; (cnt << 16) | flags << 12
    or r3, r1, r1                   ; the full last 32-bit word, with incremented refresh & state counters
    sra+ r1                         ; update the last 32-bit word in the compressed LUT
    ; --- step 2: select one of the 3 blocks, each stored in 7x32-bit words ---
    ; r2 is 1, 2 or 3

    mov 16, r4                      ; loop counter in bytes for this CPU: 4 words x 4, decremented by c4 per word
    iext TPL
    mov TPL, r14                    ; the address in GIO of the table
    mov POS_LUT_TBL_DM, r15         ; the address in DMEM of the first 1/3
    sub r2, c1, r2                  ; 0..2
    jmp cc_zero, _load_tpl_begin    ; phase 1: first third, pointers are already correct
    mov 42, r3                      ; 7*6 - increment in the GIO address  \ for 1/3 of the table
    mov 28, r7                      ; 7*4 - increment in the DMEM address /
    add r14, r3, r14                ; + 6*7, 7 dwords per 1/3 of the table, 6 LUT entries
    add r15, r7, r15                ; + 4*7, 7 dwords ... byte address
    sub r2, c1, r2
    jmp cc_zero, _load_tpl_begin    ; phase 2: second third
    add r14, r3, r14                ; + 6*7   (phase 3: last third)
    add r15, r7, r15                ; + 4*7

_load_tpl_begin:
    ; here r4 contains 4 x number of words
    ; r14 - start address in GIO
    ; r15 - start address in DMEM
    ; publish both start addresses for the second CPU
    mov r14, g14
    mov r15, g15
    ; the other CPU should add 6*4 to g14 and use it as r14, as 4 dwords will be skipped
    ; the other CPU should add 4*4 to g15 and use it as r15, ...
    ; the counter r4 should be loaded with 3*4, as only 3 dwords will be read

; --- step 3: per-word loop, shared by both CPUs; r4 = remaining words x 4 ---
_load_tpl_loop:
    lra  rr_dword, r1
    lra  rr_dword, r1

    jmpr cc_busy, 0
    sgio+ r1                        ; store the lowest 5 bits first, don't need to mask the upper

    sra+  r1                        ; store back to DMEM (before r1 gets shifted)

    slr 5, r1, r1                   ; now prepare bits 9..5 at 4..0
    jmpr cc_busy, 0
    sgio+ r1                        ; store the lowest 5 bits, don't need to mask the upper

    slr 5, r1, r1                   ; now prepare bits 14..10 at 4..0
    jmpr cc_busy, 0
    sgio+ r1                        ; store the lowest 5 bits, don't need to mask the upper

    slr 5, r1, r1                   ; now prepare bits 19..15 at 4..0
    jmpr cc_busy, 0
    sgio+ r1                        ; store the lowest 5 bits, don't need to mask the upper

    slr 5, r1, r1                   ; now prepare bits 24..20 at 4..0
    jmpr cc_busy, 0
    sgio+ r1                        ; store the lowest 5 bits, don't need to mask the upper

    slr 5, r1, r1                   ; now prepare bits 29..25 at 4..0
    jmpr cc_busy, 0
    sgio+ r1                        ; store the lowest 5 bits, don't need to mask the upper

    sub r4, c4, r4                  ; the loop counter, 4 x remaining words (starts at 16 here, at 12 in the second CPU)
    jmp cc_nzero, _load_tpl_loop    ; loop again

#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 43, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    jmp cc_uncond, rstack           ; exit, return to the address held in rstack
; end

ORG 0xE90
; load_tpl_sec_cpu
; Second-CPU half of the TPL LUT refresh: takes the start addresses published in g14/g15,
; skips the 4 words handled by the first CPU and processes the remaining 3 words by
; jumping into _load_tpl_loop. With REFR_TPL defined the return therefore happens at the
; exit that follows _load_tpl_loop; otherwise a delay loop runs and the routine returns here.
; Modifies: r1, r4, r14, r15 (r1 only in the delay variant). Returns through rstack.
; Length 8+23, but 8+23*3=77 CPU clocks at least (but this CPU has lower priority on the GIO bus!)
load_tpl_sec_cpu:
; this CPU must have higher # in order to have lower priority on the global I/O bus!
;   sem b1100_0000_0000_0000        ; wait for g14, g15, this is done in the calling program,
                                    ; then some other tasks can be executed to use the time
#ifdef REFR_TPL
    mov 24, r14                     ; GIO offset, 4 words x 6 LUT entries
    mov 16, r15                     ; DMEM offset, 4 words x 4 bytes
    mov 12, r4                      ; loop counter 3 words x 4 bytes
    syn                             ; presumably synchronises with the first CPU before g14/g15 are read - TODO confirm
    add r15, g15, r15               ; r15 = DMEM start address + offset
    add r14, g14, r14               ; r14 = GIO start address + offset
    jmp cc_uncond, _load_tpl_loop
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 43, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero

    jmp cc_uncond, rstack           ; exit
#endif


; patch maker parameters:
; (number of bits)
; 1. Very critical:
;   SMMODE (16)  - a change here could stop complete branches in ROB immediately! (CM, BM, HCM)
;                   => store it in DBANK and refresh from there!

; 2. Critical:
;   NTRO (18), NRRO (18), NED (16)  - a change here could lead to a crash in the readout tree

; 3. Not critical:
;   ADCMSK (21)  - a change here will eventually enable noise ADC channels or will disable good channels
;   ADCDAC (5)

ORG 0xEA0
; load_patchm_par
; Refresh the patch maker parameters SMMODE, NTRO, NRRO, NED, NDLY, NP0..3 from their copy
; in DB, a block of 6 words starting at SMMODE_DB.
; this is a slow function, as it reads and writes to the GIO
; these parameters can not be stored to DMEM, as they are used in HCM and BM as well
; Each of the first five words is read with lgio+ through the r14 pointer, fetched from
; GBUSR0 into r1 and written to its register. The sixth word holds one byte per input port
; and is unpacked into NP0..3.
; Enabled by REFR_PATCHM_PAR; otherwise only a delay loop is executed.
; Modifies: r1 and, when enabled, r14. Returns through rstack.
; Length: 71 instructions
load_patchm_par:

#ifdef REFR_PATCHM_PAR
    iext SMMODE_DB
    mov SMMODE_DB, r14              ; r14 = I/O pointer to the first word of the DB block
    ; update the parameters SMMODE, NTRO, NRRO, NED, NDLY, NP0..3 stored in DB in a block of 6 words
    jmpr cc_busy, 0
    lgio+ 0                         ; start the read; presumably advances r14 - TODO confirm
    jmpr cc_busy, 0
    lpio GBUSR0, r1                 ; fetch the value that was read
    sgio r1, SMMODE;

    jmpr cc_busy, 0
    lgio+ 0
    jmpr cc_busy, 0
    lpio GBUSR0, r1
    sgio r1, NTRO;

    jmpr cc_busy, 0
    lgio+ 0
    jmpr cc_busy, 0
    lpio GBUSR0, r1
    sgio r1, NRRO;

    jmpr cc_busy, 0
    lgio+ 0
    jmpr cc_busy, 0
    lpio GBUSR0, r1
    sgio r1, NED;

    jmpr cc_busy, 0
    lgio+ 0
    jmpr cc_busy, 0
    lpio GBUSR0, r1
    sgio r1, NDLY;

    ; the next 32-bit word stores the parity and spare bit positions of the 4 input ports (bits 10..3 of NP0..3), the bits 2..0 are always 100b
    ; r14 is reused as scratch from here on; this is the last read, so the pointer is no longer needed
    jmpr cc_busy, 0
    lgio+ 0
    jmpr cc_busy, 0
    lpio GBUSR0, r1
    sll 3, r1, r14                  ; we have 3 bits on the right side
    or r14, c4, r14                 ; set bit 2, bits 2..0 are 100
    sgio r14, NP0;                  ; store the NP0

    slr 8, r1, r1                   ; now we have the par & spare of port 1 in bits 7..0
    sll 3, r1, r14
    or r14, c4, r14
    jmpr cc_busy, 0
    sgio r14, NP1;

    slr 8, r1, r1                   ; now we have the par & spare of port 2 in bits 7..0
    sll 3, r1, r14
    or r14, c4, r14
    jmpr cc_busy, 0
    sgio r14, NP2;

    slr 8, r1, r1                   ; now we have the par & spare of port 3 in bits 7..0
    sll 3, r1, r14
    or r14, c4, r14
    jmpr cc_busy, 0
    sgio r14, NP3;
    ; here r14 is already modified!
#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 25, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    jmp cc_uncond, rstack           ; exit, return to the address held in rstack

ORG 0xEE0
; load_unused
; Refresh registers that are otherwise not used: NBND, NITM1, NITM2.
; Enabled by REFR_UNUSED; otherwise only a delay loop is executed.
; Modifies: r1. Returns through rstack.
load_unused:
#ifdef REFR_UNUSED
    iext NBND_VAL
    mov NBND_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NBND

    mova NITM1_VAL, r1      ; the reset value is 14 bit with 1's
    jmpr cc_busy, 0
    sgio r1, NITM1

    ; r1 still holds NITM1_VAL; it is reloaded only when NITM2 needs a different value
    #ifneq NITM1_VAL, NITM2_VAL ; the reset values are equal
    mova NITM2_VAL, r1
    #endif
    jmpr cc_busy, 0
    sgio r1, NITM2

    ;mov MEMRW_RST_VAL, r1

#else
    ; Refresh compiled out: count r1 down in a short delay loop instead.
    ; NOTE(review): operands are offset-first here but condition-first in `jmpr cc_busy, 0`;
    ; confirm the assembler accepts both and that -2 targets the `sub`.
    mov 4, r1
    sub r1, 1, r1
    jmpr -2, cc_nzero
#endif

    jmp cc_uncond, rstack           ; exit, return to the address held in rstack