; $Id$:

  ; MCM header in tracklet mode, from MSBit left (bit 31) to the LSBit
  ; < padrow (4 bits) | column (2 bits) | 1 | EPID2 (8 bit) | EPID1 (8 bit) | EPID0 (8 bit) | 1 >
  ; EPIDx comes from CPUx, if no tracklet from CPUx, then EPIDx is 0
  ; followed by 0 to 3 dwords, depending on the number of tracklets, in order from CPU0 to CPU2
  ; the header can optionally be sent even when no tracklets are there (configuration bit)
  ;
  ; tracklet format, from MSBit left (bit 31) to the LSBit (0)
  ; < pad_position (11 bit) | slope (8 bit) | PID (12 bit) | 1 >

acq:
    ; Entry point of the tracklet acquisition. The same source is assembled once per CPU
    ; (cpu0..cpu3, selected via the preprocessor); up to the end of this part all 4 CPUs run synchronously.

    ; write something large in TPFE (Preprocessor's Linear Fit End)
#ifdef cpu0
    ; writing 0x7F to TPFE will imitate a not yet finished preprocessing time
    ; cpu2 will restore the correct TPFE two CPU clocks later and so F0..7 will be latched again
    ; but probably this will take about 15 CPU clocks???
    mov  0x7F, r3                       ; the largest possible value
    iext TPFE
    sgio r3, TPFE                       ; load TPFE
#endif

#ifdef cpu1
    ; cpu1 writes the constant MEMCOR_VAL to MEMCOR
    mov MEMCOR_VAL, r1
    iext MEMCOR
    sgio r1, MEMCOR
#endif

#ifdef cpu2
    mov  TPFE_VAL, r3                   ; load the normal value
    iext TPFE
    sgio r3, TPFE                       ; ... and restore it in TPFE
#endif

    ; Note! The upper part of the Fit registers, F8..15, is accessed directly; the lower part, F0..F7, is first latched at the end of the drift time.
    ; If this is done too early, they might be invalid! To correct for this, the latch signal is activated again by writing a much longer end of Fit time in TPFE.
    ; There is a delay in the preprocessor, so that now the latch signal will be activated again between clock 20 and 28 after wake up.
    ; During this time even F8..F15 will be invalid! After this time, all F0..F15 will be valid.
    ; So: don't use F0..F7 now, as they might not be valid yet!
    ; Even F8..F15 might be unstable between instructions 20 and 28 after wake up!

#ifdef cpu3
    ; cpu3: request the ADC mask from DBANK and decrement the watchdog counter stored in DMEM at WDOG_DM
    mov WDOG_DM, r15
    lgio 0, ADCMSK_DB                   ; read the ADCMSK from DBANK (the value is fetched from GBUSR0 further below)
    lra4 r13
    lra4 r13
    sub r13, 1, r13                     ; r13 is compared with c7 further below (counter was 0 -> decremented value is -1)
    sra r13, WDOG_DM
    ; if cpu3 can do something better here???
    sem b0000_0000_0000_0111            ; cpu0..2 should write something to g0..2
#else  ; 0..2
    ; copy f8 to r8, as f8 will be soon invalid for 20-30 clocks
    mov f8, r8
            ;##############################################################
            ;#
            ;#      check for tracklets: CHANNEL !=31, same as CHANNEL+1 != 0
            ;#      F0 contains channel, F8 channel+1
            ;##############################################################

    ; As F0 may still not be ready, we read F8, which should be F0+1. When no tracklet is present, F8 is 0
    ; here we are at instruction 7 after wake up, F8 is correct!
    sub r8, 1, adc_ch_msk             ; F8=F0+1, therefore F8-1 is the base channel of the tracklet! F8 is 0 when F0 is 31 (no tracklet)
    jmp cc_carry, _acq_no_tr_frf_dtr  ; carry means F8 was 0 - no tracklet! jump if there is no tracklet to process, in this case adc_ch_msk will be loaded with another constant anyway
#endif

    ; Double tracklet rejection: cpu1 and cpu2 drop their own tracklet when its base channel is a direct
    ; neighbour (difference -1 or +1) of the channel published by the previous CPU.
    ;
    ; up to here all 4 CPUs run synchronously, now:
    ; cpu0 doesn't need to check for double tracklets
    ; cpu1 has to check if the ch# is neighbour to that of cpu0
    ; cpu2 has to check if the ch# is neighbour to that of cpu1
    ; cpu3 does not calculate tracklets any more

    ; at _acq_no_tr_frf_dtr the adc_ch_msk will be loaded with CH_NR_NO_TRCK=23, then the cpu continues at _acq_no_tr_cont. 31 is not so good as value, see later
    ; don't use F0..F7, as they might not be valid yet! Expected to be valid at instruction 28 after wake up!
    ; even F8..F15 might be unstable between instruction 20 and 28 after wake up!
    ; F8 was copied before to r8, so use it here
#ifdef cpu1                             ; check for double tracklets, about 8 cpu clocks
    sub r8, g4, r0                      ; r0 = f8(CPU1) - f0(CPU0)
    jmp cc_eq, _acq_no_tr_frf_dtr       ; jmp out when the result is 0, f0(CPU1)+1-f0(CPU0)=0 => f0(CPU1)-f0(CPU0)=-1


    cmp r0, 2                           ; compare with 2, f0(CPU1)+1-f0(CPU0)=2 => f0(CPU1)-f0(CPU0)=1
    jmp cc_eq, _acq_no_tr_frf_dtr       ; jmp out when r0 is equal to 2
;    sub r8, 1, adc_ch_msk              ; write again to clear the bit in syn mask
#endif

#ifdef cpu2                             ; worst case about 8 cpu clocks
    sub r8, g5, r0                      ; r0 = f8(CPU2) - f0(CPU1)
    jmp cc_eq, _acq_no_tr_frf_dtr       ; jmp out when the result is 0, f0(CPU2)+1-f0(CPU1)=0 => f0(CPU2)-f0(CPU1)=-1
    cmp r0, 2                           ; compare with 2, f0(CPU2)+1-f0(CPU1)=2 => f0(CPU2)-f0(CPU1)=1
    jmp cc_eq, _acq_no_tr_frf_dtr       ; jmp out when r0 is equal to 2
;    sub r8, 1, adc_ch_msk              ; write again to clear the bit in syn mask
#endif

#ifdef cpu3                             ; will calculate an adc mask with all channels contributing to tracklets
    ; cpu3 builds in r2 the OR of three 4-bit wide channel windows (one per cpu0..2 start channel),
    ; optionally ANDs it with the ADC mask read from DBANK, and publishes it in adc_ch_msk_3.
    mov 0xF0, r1                        ; 1111 0000, mask for four channels at ch4 that should be marked

    #ifeq DYN_L1A, 1
    lgio 1, CTGDOUT                     ; request reading of the global counter - special pulse for full readout
    #endif

    syt r2                              ; or just some nops?
    and r2, 1, r2
    jmpr cc_nzero, -2                   ; loop back to the syt while bit 0 of r2 is set
                                        ; start with mask for ch4 and 0, because the shift command can not shift from 0 to 18
                                        ; but relative to 4 can shift -4 to +14
                                        ; now the adc_ch_msk_0 is updated already
    mov adc_ch_msk_0, r0                ; start channel of cpu0, = CH_NR_NO_TRCK=23 for no tracklet
    sub r0, 4, r0                       ; for CH_NR_NO_TRCK=23 (no tracklet) we get 19, which is -13 interpreted as 5 bit signed
                                        ; so 1111_0000 will result in 0 after shifted by 13 to the right!
    shl r0, r1, r2                      ; now two higher and one lower channels are marked around the channel selected in f0 of cpu0
                                        ; if no tracklet, the shifted mask will be 0!
                                        ; the shift is within -16..+15, + is left, - is right

    #ifeq DYN_L1A, 1                    ; dynamical control of the full readout
    mova SML2_VALnoA, r5                ; prepare to write the SML2, load it without L1A/R bit
    jmpr cc_busy, 0                     ; wait for the global bus
    lpio GBUSR1, r0                     ; get the global counter, requested before - was a special pulse for full readout?
    and r0, c1, r0                      ; we expect 0 or 1
    sll 14, r0, r0                      ; shift to position 14 -> ignore_L1R
    or r0, r5, r5                       ; merge with the SML2 value
    sgio r5, SML2                       ; write the complete SML2 with ignore_L1R set (when the counter was 1) or cleared (when the counter was 0)
    mov 0, r0                           ; prepare to clear the counter
    jmpr cc_busy, 0
    sgio r0, CTGDINI                    ; clear the counter
    #endif

    syn                                 ; wait eventually for cpu1,2 to finish with double tracklet detection, their delay is not predictable
    mov adc_ch_msk_1, r0                ; channel 1
    sub r0, 4, r0                       ; -4
    shl r0, r1, r0                      ; 0xF0 << (ch-4)
    or  r0, r2, r2                      ; set the bits from cpu1

    mov adc_ch_msk_2, r0                ; channel 2
    sub r0, 4, r0                       ; -4
    shl r0, r1, r0                      ; 0xF0 << (ch-4)
    or  r0, r2, r2                      ; now in r2 all channels are marked that should be integrated & read

    ; IMPORTANT!
    jmpr cc_busy, 0
    lpio GBUSR0, r1                     ; read from I/O space (the ADCMSK_DB value requested at the beginning of acq)

    iext ADCMSK_DB                      ; refresh there
    sgio r1, ADCMSK_DB

    #ifgt TR_ADCMSK_AND, 0
    and r1, r2, r2                      ; and with the ADC channels mask to avoid reading masked channels
    #endif
    cmp r13, c7                         ; watch dog counter was 0, decremented is -1
    jmpr cc_neq, +2                     ; skip when not -1
    mov 0, r2                           ; clear the ADC mask if too many events with tracklets one after another
    mov r2, adc_ch_msk_3                ; write to a GRF register, this will release the other CPUs

    jmpr cc_busy, 0                     ; wait
    iext ADCMSK                         ; and write to the register
    sgio r1, ADCMSK                     ; so it doesn't need to be extra refreshed!

    ; !!! If some channel was masked, then it will contain just the baseline. It still can "contribute" to a tracklet?
    ; should we integrate it? We will lose the baseline there!
    ; if this is not desirable, remove the 2 lines of code above!

#else

; cpu0..2 continue here, with or without an own tracklet: initialise charge_i, do some CPU specific
; housekeeping and prepare the registers for the Q2 integration while waiting for cpu3 to publish adc_ch_msk_3.
_acq_no_tr_cont:                        ; cpu0..2
    ; at this point cpu0 is first, 1..2 are almost synchronous but about 10 clocks later, cpu3 has a delay of about 10+10 clocks
    ; here the CPUs without tracklets have CH_NR_NO_TRCK=23 (instead of 31) in the adc_ch_mask[i] register (i=0..2)
    mov 0xFF, r12
    #ifdef cpu0
    mov r12, charge_i                   ; 7..0
    #endif
    #ifdef cpu1
    sll 8, r12, charge_i                ; 15..8
    #endif
    #ifdef cpu2
    swp r12, charge_i                   ; 23..16
    #endif
    sem b0000_0000_1000_0000            ; cpu0..2 will wait until cpu3 writes something to g7 (adc_ch_msk_3)
    ; don't use r12 and r13 until the line with div r12, r3 in the fit part!
    mov 0, r12                          ; init some registers for later
    sll NACHKOMMAST, 1, r13             ; shift +1 to the left nachkommast-times, c1=1     ??? before we loaded c7 111.111 shifted left
                                        ; load 2**nachkommast                              ??? then it was -2**nachkommast
  #ifdef cpu0
    ; cpu0 will only read the parameters from DMEM and store in GRF registers
    mov  SCALE_Y_DM, r15                ; prepare for reading from DMEM
    nop

    lra  rr_dword, r1                   ; read scale_y
    lra+ rr_dword, r1
    mov  r1, scale_y

    lra  rr_dword, r1                   ; read offs_y
    lra+ rr_dword, r1
    mov  r1, offs_y

    lra  rr_dword, r1                   ; read scale_d
    lra+ rr_dword, r1
    mov  r1, scale_d

    #ifdef SCALE_Q_DM
    lra  rr_dword, r1                   ; read scale factor for Q, if used
    lra+ rr_dword, r1
    mov  r1, scale_q
    #else
    add r15, 4, r15                     ; no scale_q: only advance the pointer by one dword
    #endif

  #endif

  #ifdef cpu1
    ; switch on the NIICE (NI input control ports enable) lvds cells
    mov 1, r0
    sgio r0, NIICE
  #endif

  #ifdef cpu2
    ; cpu2 will store back to DMEM the fit parameters, read by cpu0 before
    ; NOTE(review): the sem below replaces the g7 wait armed above and g7 is re-armed only after the syn;
    ; this presumably relies on cpu3 writing adc_ch_msk_3 later than that point - confirm the timing.
    sem b0010_0000_0000_0000            ; cpu2 waits until cpu0 writes to g13=scale_d
    mov  SCALE_Y_DM, r15                ; prepare for writing to DMEM
    syn
    sem b0000_0000_1000_0000            ; cpu2 will wait until cpu3 writes something to g7 (adc_ch_msk_3)
    mov scale_y, r1
    sra+ r1
    mov offs_y, r1
    sra+ r1
    mov scale_d, r1
    sra+ r1
    #ifdef SCALE_Q_DM
    mov scale_q, r1
    sra+ r1
    #endif
    #ifdef DEFL_CR_DM
    nop
    nop  ; sra+ ???
    #endif
  #endif

    ; prepare for the charge integration, to shorten the whole duration. might be unused in case of no tracklets
    mov EBR0, r1                        ; cpu0..2 have 5 channels, cpu3 has 6 channels to read, each channel has 64 (0x40) ADC samples
    mov Q2_LEFT_MRG_VAL, r0             ; load the left integration margin
    add r1, r0, r1                      ; modify the start address EBR0 (timebin0) to the beginning of the integration window
    mov ADC_Q2, r15                     ; the start address in RAM to store the charge depends on the CPU# and is defined so
    mov 0x40, r2                        ; the increment in the event buffer, 64 samples/channel
    sub r15, 4, r15                     ; r15-=4, as the loop begins with increment, 4 as we have 4 bytes/dword
    mov EBR_CH_MSK, r4                  ; 0..4 or 5 bit set, rest 0

    syn                                 ; cpu0..2 will wait until cpu3 writes something to g7 (adc_ch_msk_3)
#endif
    ; here come all CPUs simultaneously! No matter if they have a tracklet or not!
    ; the global registers g0..g2 named charge_i are initialised with 0xFF << 8*CPU_ID
    ;
    ; This part leaves early when the common channel mask is empty; otherwise every CPU arms its sync mask
    ; for the Q2 integration that follows.

    shlt 0, adc_ch_msk_3                 ; test if the mask with all channels containing tracklet is 0
    jmp cc_zero, _acq_no_tr_mask         ; in this case no tracklet remained at all! this case is simple.


; I n t e g r a t i o n    in    Q2   w i n d o w
;
; this is done by all CPUs, as each CPU has access only to a subset of all ADC Event buffers
; integrate in a window defined as constants Q2_LEFT_MRG_VAL and Q2_WIN_WIDTH_VAL (2..9)
; read from the event buffer, accumulate and store to RAM

; it takes (2*WIDTH+9)*Nch+10 CPU Clocks, for CPU0..2 from about 10 to about 55+10*WIDTH, for CPU3 from about 10 to
; about 64+12*WIDTH
; this means, depending on the distribution of the marked channels, the CPUs will need very different number of clocks
; to finish this part!

#ifdef cpu0
    sem b0000_0000_0000_0001            ; cpu0 will wait for the sum of Q2 of its tracklet channels
#endif
#ifdef cpu1
    sem b0000_0000_0000_0010            ; cpu1 will wait for the sum of Q2 of its tracklet channels
#endif
#ifdef cpu2
    sem b0000_0000_0000_0100            ; cpu2 will wait for the sum of Q2 of its tracklet channels
#endif
#ifdef cpu3
    sem b0000_0111_0111_0000            ; cpu3 waits for g4..6 and g8..10: cpu0..2 will store something there to say that they are ready with the charge integration
#endif

; Q2 integration, done by the included file fit_q2_integrate.asm.
;
; inputs: r1 contains EBR0+Q2_LEFT_MRG_VAL
;         r15 contains the start address in RAM to store the sums (with offset +ch)
;         adc_ch_msk_3 is g7 and contains the ADC mask with the channels, used in the tracklets
;
; constants used:
;         EBR_CH_INI is the start channel # for each CPU: 0, 5, 10, 15
;         EBR_CH_MSK contains 1 1111 for CPU0..2 and 11 1111 for CPU3
;         Q2_WIN_WIDTH contains the width of the window
;
; modified: almost all reg, except for r12 and r13
;           no global regs modified
;           no constants modified

    ; for CPU0..2 this initialisation was moved upwards to make better use of the time spent waiting for CPU3!
    ; for CPU3 the code remains here
#ifdef cpu3
    mov EBR0, r1                        ; cpu0..2 have 5 channels, cpu3 has 6 channels to read, each channel has 64 (0x40) ADC samples
    mov Q2_LEFT_MRG_VAL, r0             ; load the left integration margin
    add r1, r0, r1                      ; modify the start address EBR0 (timebin0) to the beginning of the integration window
    mov ADC_Q2, r15                     ; the start address in RAM to store the charge depends on the CPU# and is defined so
    mov 0x40, r2                        ; the increment in the event buffer, 64 samples/channel
    sub r15, 4, r15                     ; r15-=4, as the loop begins with increment, 4 as we have 4 bytes/dword
    mov EBR_CH_MSK, r4                  ; 0..4 or 5 bit set, rest 0
#endif

#inc "fit_q2_integrate.asm"
    ; at this point we have
    ;   in RAM at address ADC_Q2_0 an array with the integrated charges, updated only for the channels contributing to the tracklets
    ;   the others remain unchanged!

#ifNdef cpu3
    ; at the end of the integration, each CPU writes the same to its adc_ch_msk register (g4..7) to mark that it has finished.
    ; CPU3 must wait here until the bits 6..4 in its sync register are cleared. Of course it has finished the integration
    ; of its own channels.
    mov adc_ch_msk, adc_ch_msk          ; write the same, just to clear the corresponding bit in the sync mask
                                        ; the syn command is of CPU3
    ; here some CPUs come earlier than others!
#endif


#ifdef cpu3
    ; CPU3 will wait until CPU0..2 have finished the Q2 integration, then will calculate the total charge Q2
    ; for each tracklet and will store it to g0..2 for CPU0..2
    ; Note: sometimes cpu3 has more to do than the other CPUs!
#inc "fit_add_accs.asm"

; refresh the endmarkers and NI-enables (but for HCM and BM must be done separately)
; r3 counted in fit_add_accs how many (of the max 3) sums were NOT calculated, use this information to decide
; to make a small refresh
#ifdef REFR_NI
    cmp r3, 1
    jmp cc_ltu, _acq_skip_refr_ni       ; r3 < 1, i.e. all 3 sums were calculated: skip the refresh
    ; r3 was initialised with 3 before adding the accumulated charges and was decremented by each existing tracklet
    ;
    ; do a small refresh, only when CPU3 has calculated less than 3 sums (there were less than 3 tracklets)
    ; otherwise it might delay the sending of the tracklets

    mova NSIG_TR_VAL, r1
    mova NSIG_RR_VAL, r2
    swp r2, r2
    or r1, r2, r1                       ; the full NES register has RR in bits 31..16 and TR in bits 15..0
    jmpr cc_busy, 0
    sgio r1, NES

    mov NIICE_VAL, r1
    jmpr cc_busy, 0
    sgio r1, NIICE

_acq_skip_refr_ni:
#endif
    syn                                 ; wait for the tracklets of cpu0..2
    iext 0x100001
    mov  0x100001, r6
    shl 8, r6, adc_ch_msk               ; load 0x1000_0100, it is XORed into the tracklet words before sending
#else
    ; NOW:
    ; cpu0..2 will check again if they have a tracklet and will eventually make the fit

    mov adc_ch_msk, r0                  ; the ADC start channel
    cmp r0, CH_NR_NO_TRCK               ; 23, used to mark no tracklet
    ; it might happen that one of CPU0..2 had to integrate, but has no tracklet, so it can finish now
    jmp cc_eq, _acq_no_tr_q2f           ; this CPU has no tracklet and has integrated its channels
                                        ; therefore skip the fit procedure!

#inc "fit_fit.asm"
    ; at the end CPU0..2 clear the bits 8..10 in the sync mask of CPU3
#endif

#ifndef cpu3
; Common synchronisation point of cpu0..2 before the header and the tracklet words are sent.
_acq_send_hdr_trackl:
    syn                                 ; only CPU0..2 need to be synchronized, cpu3 has just now released them
    ; here all CPUs must come synchronously, CPU0 needs that CPU1,2 are ready with the fit!
    ; g0..2 contain the full 20-bit charge word and is 0 in case of no tracklet
    ; g8..10 contain the 32-bit tracklets or are don't care when the corresponding charge word is 0.
    ; NOTE(review): the code that follows tests the HPID byte against 0xFF (not 0) to detect "no tracklet",
    ; so the two statements above look outdated - confirm.

    ; CPU0 will send the header and part of PID of all up to 3 tracklets
    ; CPU1 will send the tracklet of CPU0, CPU2 will send the tracklet of CPU1, CPU3 will send the tracklet of CPU2

    ; CPU0 has to get from each charge_i (i=0..2) the bits 19..12 to prepare the HPIDx in the header

    ; if no tracklets at all, but headers still wanted, all 3 charges will be 0 and only the header will be sent

    ; MCM header: from MSBit left (bit 31) to the LSBit
    ; < 1 | padrow (4 bits) | column (2 bits) | HPID2 (8 bit) | HPID1 (8 bit) | HPID0 (8 bit) | 1 >
    ; HPIDx come from CPUx, if no tracklet from CPUx, then HPIDx is 0xFF. HPIDx = charge_i >> 12
    ; followed by 0 to 3 dwords, depending on the number of tracklets, in order from CPU0 to CPU2
    ; the header can optionally be sent even when no tracklets are there.
    ; when the constant DONT_SEND_EMPTY_HDR_TR is 1, empty headers are suppressed.
    ;
    ; tracklet format, from MSBit left (bit 31) to the LSBit (0)
    ; < pad_position within the MCM (11 bit) | LPID (12 bit) | slope (8 bit) | 0 >
    ; NOTE(review): the header and tracklet layouts given here differ from those in the comment at the top
    ; of the file - confirm which description is current.

#endif
    ; the BM chips use the CPUs for nothing, but the NI there is not programmed to send own data.
    ; so it doesn't matter if they try to send something
    ; the HCM can send tracklets. This part is only for normal MCMs!
    ;
    ; Each CPU prepares in r6 the 32-bit word it has to send: cpu0 the MCM header, cpu1..3 the tracklet of
    ; the previous CPU. When there is nothing to send, the CPU branches to _acq_wr_em2ni instead.
_acq_send_hdr_tr_ns:
#ifdef cpu0
    ; build first ( ( (charge_2[19..12] << 8) | charge_1[19..12]) << 8 ) | charge_0[19..12]
    ; if 0xFFFFFF, then we don't have any tracklets
    ; in this case depending on DONT_SEND_EMPTY_HDR_TR load the end marker or continue with the header
    ; r8 contains the HPID0, charge_1 is HPID1 << 8 and charge_2 is HPID2 << 16

    or r8, charge_2, r6
    or r6, charge_1, r6

    ; at this point all 3 charges (bits 19..12 of each charge word) are put together in bits 23..0
  #ifeq DONT_SEND_EMPTY_HDR_TR, 1
    ; r5 was loaded with 0xFFFFFF
    ; (done at _acq_no_tr_mask_nsyn; presumably also inside fit_fit.asm for the path with tracklets - confirm)
    ; and if the 24-bit word with HPIDs is 0xFFFFFF, we don't have to send anything except for end markers
    cmp r6, r5
    jmp cc_eq, _acq_wr_em2ni            ; if = 0xFFFFFF we don't have any tracklet and write end marker
  #endif
    lgio 0, H_PAD_ROW_COL_DB            ; request the position word from DBANK
    shl 1, r6, r6                       ; otherwise put one '0' at the right side (prepared in pad_row_col)
    jmpr cc_busy, 0
    lpio GBUSR0, r5

    or r6, r5, r6                       ; and add the position information, already prepared in boot program
                                        ; (but needs to be refreshed periodically!)
    iext H_PAD_ROW_COL_DB
    sgio r5, H_PAD_ROW_COL_DB           ; write the value back, which refreshes it
    ; here CPU0 is ready
#endif

#ifdef cpu1                             ; send the tracklet of CPU0, if any, otherwise just end marker
    mov charge_0, r6
    cmp r6, 0xFF                        ; - test the same part =? 0xFF as used in the header
    jmp cc_eq, _acq_wr_em2ni            ; if = 0xFF we don't have a tracklet and write end marker
    mov trackl_0, r6                    ; load the 32-bit word to be sent to r6
    xor r6, adc_ch_msk_3, r6            ; invert two bits in the tracklet, adc_ch_msk_3 initialised before by CPU3
#endif

#ifdef cpu2                             ; send the tracklet of CPU1, if any, otherwise just end marker
    slr 8, charge_1, r6                 ; and now they are shifted to bits 7..0
    cmp r6, 0xFF                        ; test the same part =? 0xFF as used in the header
    jmp cc_eq, _acq_wr_em2ni            ; if = 0xFF we don't have a tracklet and write end marker
    mov trackl_1, r6                    ; load the 32-bit word to be sent to r6
    xor r6, adc_ch_msk_3, r6            ; invert two bits in the tracklet, adc_ch_msk_3 initialised before by CPU3
#endif

#ifdef cpu3                             ; send the tracklet of CPU2, if any, otherwise just end marker
    slr 16, charge_2, r6                ; and now they are shifted to bits 7..0
    cmp r6, 0xFF                        ; - test the same part =? 0xFF as used in the header
    jmp cc_eq, _acq_wr_em2ni            ; if = 0xFF we don't have a tracklet and write end marker
    mov trackl_2, r6                    ; load the 32-bit word to be sent to r6
    xor r6, adc_ch_msk_3, r6            ; invert two bits in the tracklet, adc_ch_msk_3 initialised before by CPU3
#endif
    ; all CPUs send the prepared 32-bit word.
    ; input: r6 = word to send (header, tracklet or end marker)
_acq_write2ni:
    spio r6, NODP
    sra r6, TrcklDMEMa                  ; store the tracklet in DMEM for debugging and as info for ZS readout
    jmp cc_uncond, clr_endloop          ; leave via clr_endloop (defined outside this part of the file)

; nothing to send for this CPU: load NSIG_TR_VAL (end marker) into r6 and send it instead
_acq_wr_em2ni:
    mova  NSIG_TR_VAL, r6
    jmp cc_uncond, _acq_write2ni


; for CPU0..2, which may calculate tracklets
#ifndef cpu3
; 1. the Fit Register File has no tracklet candidate for this CPU
; or
; 2. after the check for double tracklets, this CPU has no tracklet any more
;
; marks "no tracklet" by loading CH_NR_NO_TRCK into adc_ch_msk and rejoins the common path
_acq_no_tr_frf_dtr:
    mov CH_NR_NO_TRCK, adc_ch_msk
    jmp cc_uncond, _acq_no_tr_cont
    ; the CPUs should continue eventually to integrate the charge in the third window for another CPU(s) with tracklet


    ; finished with integration of Q2, but no own tracklet
    ; cpu1 uses the waiting time for a small refresh (load_direct_bm_m), the other CPUs execute one nop;
    ; afterwards the code falls through into _acq_out_rng
_acq_no_tr_q2f:
    ; from here the CPU has to wait about 100 to 140 clocks
    ; if CPU2 comes here, it can do some refresh of very important parameters
    ; CPU1 will come here rarely
    ; CPU0 will come here even more rarely
    ; inside the fit part found, that the slope is out of range

    #ifdef cpu1
    mvpcr +2, rstack                    ; return address = the instruction after the jmp
    ; small refresh
    jmp cc_uncond, load_direct_bm_m
    #else
    nop
    #endif

;   #ifdef cpu2
;   mvpcr +2, rstack
;   ; refresh 4 parameters from DMEM and 4 common constants from DBANK (as needed even on the HCM)
;   jmp cc_uncond, load_dm_par
;   #endif

    ; fit started, but the slope is out of range
    ; (also reached by fall-through from _acq_no_tr_q2f)
_acq_out_rng:
    mov CH_NR_NO_TRCK, adc_ch_msk       ; to say to CPU3 that it doesn't need to calculate the sum of Q2

    ; refresh the own interrupt vectors
    mvpcr +2, rstack                    ; return address = the syn below
    jmp cc_uncond, load_irq_vec

    syn                                 ; wait for CPU3, which adds the 4 integrated charges
    sem b0000_0000_1000_0000            ; prepare to wait until cpu3 is ready with Q2 of all CPU0..2
    jmp cc_uncond, _acq_no_tr_mask_nsyn


; ALL CPUs 0..2 do not have tracklets!!!
; 1 . some specific delay
; 2 . send eventually MCM tracklet header and endmarkers
; this time can be used to make some register recovery?
;
; The free time is used for refreshing: r7 = (3 * c13) & 7 selects one of 8 refresh slots, the slots
; 0..2 are handled right here, 3 at _acq_refr_3 and 4..7 via _acq_refr_4_7. Every slot ends at
; _acq_no_tr_mask_nsyn, either by falling through or via the return address loaded into rstack.
_acq_no_tr_mask:                        ; the adc mask containing the tracklet channels is 0
    mov 7, r7
    sll 1, c13, r1                      ; x2, c13 contains the counter for events without tracklets, used for refresh
    add r1, c13, r1                     ; x2 + x1 = x3
    and r7, r1, r7                      ; the lower 3 bits, so we have the sequence: 0, 3, 6, 1, 4, 7, 2, 5 => 0...
    cmp r7, 3                           ; 0..2 used to refresh TPL LUT
    jmp cc_gtu, _acq_refr_4_7
    jmp cc_eq,  _acq_refr_3
; here is 0..2
    ; 3 times TPL LUT
    #ifdef cpu0

    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_tpl             ; may be move to raw data readout, it is too long for here

    #endif

    #ifdef cpu1

    sem b1100_0000_0000_0000            ; wait for g14, g15
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_first_m  ; first do something else, CPU0 needs some time to prepare the loops

    mov _acq_no_tr_mask_nsyn, rstack    ; return address is the end of the refresh
    jmp cc_uncond, load_tpl_sec_cpu     ; prepare the address pointers and sync

    #endif

    #ifdef cpu2
    ; three-way dispatch on r7 = 0, 1 or 2; all three targets return to _acq_no_tr_mask_nsyn
    mov _acq_no_tr_mask_nsyn, rstack
    cmp r7, 1
;    jmp cc_ltu,    load_dm_par          ; 0    not necessary more
    ; fixed: this jump used cc_gtu like the next one, so slot 0 never reached load_direct_bm_m
    ; and silently ran the slot 1 routine (load_direct_adc) instead
    jmp cc_ltu,    load_direct_bm_m     ; 0     do something else?
    jmp cc_gtu,    load_direct_bm_m     ; 2
    jmp cc_uncond, load_direct_adc      ; 1

    #endif

; refresh slot 4, entered only by the jump from _acq_refr_4_5 (the code above ends with unconditional jumps)
; each CPU calls its refresh routine(s); the last one returns directly to _acq_no_tr_mask_nsyn
_acq_refr_4:
; here is 4
    #ifdef cpu0
    mvpcr +2, rstack
    jmp cc_uncond, load_pre_par
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_irq_vec
    #endif

    #ifdef cpu1
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_patchm_par
    #endif

    #ifdef cpu2
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_first_m
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_pre_clust_par
    #endif

; r7 is 4 or 5 here (jumped from _acq_refr_4_7): dispatch slot 4, handle slot 5 in place
_acq_refr_4_5:
    cmp r7, 4
    jmp cc_eq, _acq_refr_4
; here is 5
    #ifdef cpu0
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_all
    #endif

    #ifdef cpu1
    mvpcr +2, rstack
    jmp cc_uncond, load_lptc_fil
    #endif

    #ifdef cpu2
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_first_m
    #endif

    ; all of cpu0..2: finally refresh the interrupt vectors and return to _acq_no_tr_mask_nsyn
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_irq_vec

; refresh slot 7, entered by the jump from _acq_refr_4_7
; each CPU variant ends with a routine that returns directly to _acq_no_tr_mask_nsyn
_acq_refr_7:
; here is 7
    #ifdef cpu0
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_gain_filg
    #endif

    #ifdef cpu1
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_first_m
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_irq_vec
    #endif

    #ifdef cpu2            ; these 2 are not necessary!
    sem b0000_0010_0000_0000            ; wait until CPU1 finished with refresh (CPU1 writes to g9)
;    mvpcr +2, rstack
;    jmp cc_uncond, load_dm_par
    syn
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_bm_m
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_irq_vec
    #endif

; r7 is 4..7 here: dispatch 4/5 to _acq_refr_4_5 and 7 to _acq_refr_7, handle slot 6 in place
_acq_refr_4_7:
    cmp r7, 6
    jmp cc_ltu, _acq_refr_4_5
    jmp cc_gtu, _acq_refr_7
; here is 6

    #ifdef cpu0
    mvpcr +2, rstack
    jmp cc_uncond, load_pre_par
    mov 0, trackl_i                     ; write something, to release CPU1
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_irq_vec
    #endif

    #ifdef cpu1
    sem b0000_0001_0000_0000            ; wait until CPU0 finished with refresh (CPU0 writes to g8)
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_first_m
    syn
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_irq_vec
    #endif

    #ifdef cpu2
    mov _acq_no_tr_mask_nsyn, rstack
    jmp cc_uncond, load_direct_all
    #endif

; refresh slot 3, entered by the jump from _acq_no_tr_mask
; all called routines return behind their jmp, so every CPU falls through into _acq_no_tr_mask_nsyn
_acq_refr_3:
; here is 3

    #ifdef cpu0
    mvpcr +2, rstack
    jmp cc_uncond, load_gain_fila
    #endif

    #ifdef cpu1
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_first_m
    mvpcr +2, rstack
    jmp cc_uncond, load_unused
;    mvpcr +2, rstack
;    jmp cc_uncond, load_irq_vec
    #endif

    #ifdef cpu2
    sem b0000_0010_0000_0000            ; wait until CPU1 finished with refresh (CPU1 writes to g9)
;    mvpcr +2, rstack                    ; these 2 are not necessary!
;    jmp cc_uncond, load_dm_par
    syn                                 ; wait until CPU1 finished with refresh
    mvpcr +2, rstack
    jmp cc_uncond, load_direct_bm_m
    mvpcr +2, rstack
    jmp cc_uncond, load_irq_vec
    #endif
    ; refresh the interrupt vectors, only when all CPUs don't have tracklets

; End of the "no tracklet" path of cpu0..2: mark the own HPID byte in charge_i as 0xFF (no tracklet),
; write trackl_i and continue with the common sending part.
_acq_no_tr_mask_nsyn:

#ifdef cpu0
  #ifeq DONT_SEND_EMPTY_HDR_TR, 1
    ; and if this word is 0xFFFFFF, we don't have to send anything except for end markers
    iext 0xFFFFFF                       ; 24-bits with 1's
    mov 0xFFFFFF, r5                    ; r5 is compared with the HPID word in _acq_send_hdr_tr_ns
  #endif
#endif

    sem b0000_0000_1000_0000            ; prepare to wait until cpu3 is ready with Q2 of all CPU0..2
    mov 0xFF, r8                        ; bits 19..12 shifted to 7..0, ok for charge_0
    #ifdef cpu0
    mov r8, charge_i                    ; r8 contains charge_0 only on cpu0
    #endif
    #ifdef cpu1
    sll 8, r8, charge_i                 ; to bits 15..8
    #endif
    #ifdef cpu2
    swp r8, charge_i                    ; to bits 23..16
    #endif
    mov 0, trackl_i                     ; write something, to release CPU3
    jmp cc_uncond, _acq_send_hdr_trackl ; after the jump CPU0..2 will be synced again and released by CPU3

#else
; CPU3
; ALL CPUs 0..2 do not have tracklets!!!
; 1 . some specific delay
; 2 . send eventually MCM tracklet header and endmarkers
; this time can be used to make some register recovery?
;
; cpu3 refreshes its interrupt vectors, reloads the watchdog counter with MAX_TR_EVENT, increments the
; counter of empty events (c13) and finally releases cpu0..2 by writing to adc_ch_msk.
_acq_no_tr_mask:    ; the adc mask containing the tracklet channels is 0
                    ; entry points after decoding, that no tracklet should be calculated

    sem b0111                           ; wait until cpu0..2 have written to g0..2 (charge_i)

    ; refresh the interrupt vectors, only when all CPUs don't have tracklets
    mvpcr +2, rstack
    jmp cc_uncond, load_irq_vec

    mov MAX_TR_EVENT, r1                ; restart the watchdog counter in DMEM
    sra r1, WDOG_DM

    ; increment the number of empty tracklet runs
    mov c13, r1
    add r1, 1, r1

    syn

    jmpr cc_busy, 0
    sgio r1, C13CPUA                    ; update c13 used to control which refresh routine to start
                                        ; the other CPUs are ready here, so the new value is for the next event
    mov 0, adc_ch_msk                   ; release the CPU0..2
    jmp cc_uncond, _acq_send_hdr_tr_ns  ; and jump to the exit without sync
#endif
    nop