; $Id$:

; TO DO: pad_position is not ready now!


; $Id$:
; this is include file for the main fit program

    ; The only jumps out of this code go to _acq_out_rng, which is taken
    ; when the deflection is out of range. Do we need this now?

    ; input: all fit registers
    ;       r12 = 0 in clear procedure
    ;       r13 = (1 << nachkomma) in clear procedure
    ; output:
    ;       32-bit tracklet word stored in trackl_0..2 (global reg 8..10)
    ;       20-bit charge   word stored in charge_0..2 (global reg 0.. 2)

                ;##############################################################
                ;#
                ;#      fit parameter calculation
                ;#
                ;#      SLOPE = ( N * XY - X * Y )  / ( N * XX - X * X )
                ;#      POS   = ( XX * Y - X * XY ) / ( N * XX - X * X )
                ;#
                ;##############################################################

        ; Merge the fit sums of the two windows (f1..f4 and f9..f12) and start the
        ; division that delivers the scaled reciprocal of the common denominator
        ; N * XX - X * X used by both the SLOPE and the POS formula.
        ; Written here: r15 = N, r1 = X, r2 = XX, r0 = Q, r4 = N * XX, r3 = denominator.
        ; The destination is the last operand of every instruction.
        add     f1, f9, r15             ; N  = N0 + N1      - merge the number of hits
        add     f3, f11, r1             ; X  = X0 + X1      - merge the sums of X
        mul32   r1, r1, r3              ;      X * X

        add     f4, f12, r2             ; XX = XX0 + XX1    - merge the sums of X**2
        mul32   r15, r2, r4             ;      N * XX
        ; NOTE(review): the unpacking further down (slr 16 / swp + slr 16) treats both
        ; halves of Q as 16-bit fields, so "bits 14..0" may be meant as 15..0 - confirm.
        add     f2, f10, r0             ; Q  = Q0 + Q1      - merge the sums of ADC values, bits 14..0 correspond to window0, bits 31..16 to window1
        sub     r4, r3, r3              ; N * XX - X * X    - the result of the multiplication needs 1 clock more!
                                        ; registers initialized by the clear procedure:
                                        ; r12 = 0
                                        ; r13 = (1 << nachkomma); "nachkomma" is presumably the number of fractional bits - confirm
                                        ; div divides the 64-bit signed integer composed of ([63..32] = r13, [31..0] = r12) by r3 (signed)
        div     r12, r3                 ; start 2**(32+nachkomma)/DENOMINATOR, ready in 19 clocks


        ; Build the merged sums Y and XY and the two numerators while the divider is busy:
        ;   r1 = OF = XX * Y - X * XY   (numerator of POS)
        ;   r2 = SL = N * XY - X * Y    (numerator of SLOPE)
        ; The window-1 sums get N1 (resp. X1) shifted left by PAD_BITS added; the
        ; "256 *" in the comments presumably assumes PAD_BITS = 8 - confirm.
        ; The numbers in parentheses count the instructions issued since the div.
        shl     PAD_BITS, f9, r5        ;      256 * N1                 ( 1)
        add     f13, r5, r5             ;      Y1 + 256 * N1            ( 2)
        add     f5, r5, r5              ; Y  = Y0 + Y1 + 256 * N1       ( 3)

        shl     PAD_BITS, f11, r6       ;      256 * X1                 ( 4)
        add     f14, r6, r6             ;      XY1 + 256 * X1           ( 5)
        add     f6, r6, r6              ; XY = XY0 + XY1 + 256 * X1     ( 6)

        ; NOTE(review): r3 (the divisor handed to div) and r4 are reused as scratch from
        ; here on; this presumably relies on div having latched its operands - confirm.
        ; mus32 is presumably the signed counterpart of mul32 - confirm.
        mus32   r1, r5, r3              ;      X * Y                    ( 7)
        mus32   r1, r6, r4              ;      X * XY                   ( 8)
        mus32   r2, r5, r1              ;      XX * Y                   ( 9)
        mus32   r15, r6, r2             ;      N * XY                   (10)

        sub     r1, r4, r1              ; OF = XX * Y - X * XY          (11)
        sub     r2, r3, r2              ; SL = N * XY - X * Y           (12)


        ; Useful work placed here only to fill the clocks spent waiting for the divider:
        ;   r15 = DMEM address of the min/max deflection entry of this channel
        ;   r8  = channel offset of the position, relative to the middle of the chip
        ;   r5  = MSK_SLOPE, r9 = ROUNDING_ADD (both consumed further down)
        ; the deflection table is at address 0x0C0 in DMEM (initialized at 0xC030 in GIO)
        ; these instructions were placed later, but can be executed here to use some of the wait clocks
        ; NOTE(review): the comment used to say "4 instructions"; only three address instructions are visible here.
        mov DEFL_RNG_TBL_DM, r15        ; the start address of the min-max table in DMEM
        shl 3, f0, r7                   ; 2**3 (=8) * ch - the offset in the table, 2 x 32-bit entries (signed) per channel
        add r7, r15, r15                ; the address of this channel's entry in DMEM

        shl     PAD_EXT, f0, r8         ; Ch(left)*2**PAD_EXT (PAD_EXT=5+8)
        sub r8, offs_y, r8              ; transfer to the middle of the chip, offs_y is g12 and is loaded before

        mov MSK_SLOPE, r5               ; needed later; we have to wait some clocks here anyway
        mov ROUNDING_ADD, r9            ; prepare for the rounding
        ; 13 clocks after the start of the division, 6 more clocks are needed, just wait?
        ; jmpr offsets count from the jmpr itself (compare "jmpr ..., +2" skipping one line
        ; further down), so offset 0 re-executes this instruction while cc_divb holds
        ; (presumably "divider busy" - confirm).
        jmpr cc_divb, 0                 ; wait about 2 clocks

        ; Turn the two numerators into position (r1) and slope (r2):
        ; multiply by the reciprocal of the denominator, add the channel offset
        ; (position) or the constant c0 (slope), scale with scale_y / scale_d and
        ; add the rounding constant prepared in r9.
        ; The upper 32 bits of each product are picked up from r13. Two multiplications
        ; are issued back to back and r13 is then read twice, in the same order;
        ; presumably each read returns the upper word of the corresponding product
        ; (pipelined multiplier) - confirm against the CPU manual.
        die     r7                      ; get 2**(32+nachkomma)/DENOMINATOR

        mus     r7, r1, r1              ; lower 32-bit word of POS, we use the upper bits
        mus     r7, r2, r2              ; lower 32-bit word of SLOPE, we use the upper bits in r13

        add     r13, r8,  r1            ; save the tracklet offset + (ch*pad_size - constant) in r1, it could be negative!
        #ifeq INV_FIT_POS, 1            ; the counting of the channels will be reversed
        neg r1, r1                      ; mirror the position when the channel counting is reversed
        #endif

        ;add     r13, defl_cor, r2       ; save the deflection+its correction in r2
        add     r13, c0, r2             ; save the deflection + its correction (taken from c0) in r2

        ; here we have:
        ; offset in r1  (1 PAD = 256*2**nachkommast)
        ; slope  in r2  (1 PAD = 256*2**nachkommast) in PADS/timebin, may be 4-5 pads/30 timebins?

        mus     r1, scale_y, r1         ; scale properly, the factor is multiplied by 2**32
        mus     r2, scale_d, r2         ; the result is in bits 63..32 (r13)
        add     r13, r9, r1             ; rounding of the position, the result is saved with one clock delay
        add     r13, r9, r2             ; rounding of the slope

        ; Remove the extra fractional bits from the position in r1 and clip it to the
        ; range -POS_MAX_ABS .. +POS_MAX_ABS. A position of exactly 0 is replaced by 1
        ; (the reason is not stated in this file). All paths continue at _fit_fit_check_sl.
        sar     NACHKOMMAST, r1, r1     ; remove the additional bits in position
        jmp cc_neg, _fit_fit_check_pos_neg      ; negative positions are handled separately below
        ; r1 is not negative: check if r1 - position_max >= 0
        jmpr cc_nzero, +3               ; non-zero: skip the next two lines and continue at the cmp
        mov 1, r1                       ; the position was exactly 0: use 1 instead
        jmp cc_uncond, _fit_fit_check_sl
        cmp r1, POS_MAX_ABS
        jmp cc_les, _fit_fit_check_sl   ; skip the rest if r1 <= POS_MAX_ABS
        mov POS_MAX_ABS, r1             ; clip the position in r1 to POS_MAX_ABS
        jmp cc_uncond, _fit_fit_check_sl

_fit_fit_check_pos_neg:
        mov POS_MAX_ABS, r13            ; the max absolute value of the position
        ; r1 is negative: check if r1 + |position_min| >= 0
        add r1, r13, r13                ; r13 = POS + POS_MAX_ABS
        jmp cc_ges, _fit_fit_check_sl   ; if the result is not negative, skip the correction
        sub r1, r13, r1                 ; the result was negative: r1 - (r1 + POS_MAX_ABS) moves the position towards 0,
                                        ; exactly to -POS_MAX_ABS!
; Remove the extra fractional bits from the slope in r2 and compare it with the
; per-channel min/max limits; leave via _acq_out_rng when it is outside.
_fit_fit_check_sl:
        sar     NACHKOMMAST_S, r2, r2   ; remove the additional bits in the slope

        #ifeq INV_FIT_POS, 1            ; the counting of the channels will be reversed
        neg r2, r2                      ; not clear if necessary and if here or before the range check?
        #endif

        ; check if the slope is in the proper range
        ; NOTE(review): the address of the table entry was computed into r15 earlier;
        ; how lra picks up that address, and why each limit takes an lra / lra+ pair,
        ; is not visible in this file - confirm against the CPU manual.
        lra     rr_dword, r13
        lra+    rr_dword, r13           ; the min limit
        cmp     r13, r2
        jmp     cc_gts, _acq_out_rng    ; jump out if min > slope
        lra     rr_dword, r13
        lra+    rr_dword, r13           ; the max limit
        cmp     r2, r13
        jmp     cc_gts, _acq_out_rng    ; jump out if slope > max

        ; Optional debugging aid: artificially generate the "tracklet out of range"
        ; condition in the selected CPU. This effectively suppresses the sending of
        ; its tracklet while most of the calculations are still executed.
        ; One block per CPU, enabled with DIS_CPUn_TRACKLETS = 1.
        #ifdef cpu0
            #ifeq DIS_CPU0_TRACKLETS, 1
            #INF Disable tracklets from CPU0 by generating out of range!
            jmp     cc_uncond, _acq_out_rng    ; don't send the tracklet, although it is almost calculated
            #endif
        #endif
        #ifdef cpu1
            #ifeq DIS_CPU1_TRACKLETS, 1
            #INF Disable tracklets from CPU1 by generating out of range!
            jmp     cc_uncond, _acq_out_rng    ; don't send the tracklet, although it is almost calculated
            #endif
        #endif
        #ifdef cpu2
            #ifeq DIS_CPU2_TRACKLETS, 1
            #INF Disable tracklets from CPU2 by generating out of range!
            jmp     cc_uncond, _acq_out_rng    ; don't send the tracklet, although it is almost calculated
            #endif
        #endif

        ; Unpack the two window charges from r0 and prepare them for the charge word.
        ; here we have offset and slope in r1 and r2. r0 contains the merged Q0/1 (in bits 14..0 and 31..16)

        ; r2 is the slope, 8-bit signed, and was checked to be within the limits
        ; r1 is the offset, 11-bit signed, and was checked to be within the correct limits
        ; the bits above its MSBit (bit 10) in r1 don't need to be cleared explicitly:
        ; r1 is shifted left by 21 further down, which moves them out of the 32-bit word

        ; now the charges
        slr 16, r0, r4                  ; r4 contains Q1 (the upper half of r0)
        swp r0, r0                      ; presumably exchanges the two 16-bit halves of r0 - confirm
        slr 16, r0, r0                  ; r0 contains Q0
; coding of the charges, floating point, with 4 possible shifts, optimally selected
#ifeq DYN_FP_Q, 1   ; 1 clock long
        or r0, r4, r8                   ; r8 is Q0 | Q1, used later to find the highest bit set
; or just multiplying to a constant
#else             ; 12 clocks long
        ; the scaled values are taken from r13 (upper word of the product), read back
        ; in the same order as the two multiplications were issued
        mul r0, scale_q, r0
        mul r4, scale_q, r4
        mov r13, r0                     ; scaled Q0
        mov r13, r4                     ; scaled Q1

        ; now check if Q0 (in r0) and Q1 (in r4) are up to 0x7F
        cmp r0, MSK_Q0Q1                ; the scaled Q0 and Q1 are 7-bit
        jmpr cc_leu, +2                 ; skip the next line if r0 <= 0x7F
        mov MSK_Q0Q1, r0                ; clip: r0 = 0x7F
        cmp r4, MSK_Q0Q1
        jmpr cc_leu, +2                 ; skip the next line if r4 <= 0x7F
        mov MSK_Q0Q1, r4                ; clip: r4 = 0x7F
        ; here r0 contains the scaled and clipped Q0
        ;      r4 contains the scaled and clipped Q1
        shl LEN_Q0Q1, r4, r4            ; sQ1 << 7
        or  r0, r4, r0                  ; sQ0 | (sQ1 << 7)
        ; at this point Q2 is already needed, to scale and clip it to 0x3F
#endif
        ; Pre-assemble the position and slope fields of the tracklet word in r1, then
        ; synchronize with the other CPUs before the third charge is used.
        ; < pad_position within the MCM (11 bit) | LPID (12 bit) | slope (8 bit) | 0 >
        ; prepare some part of the tracklet
        shl 10, r1, r1                  ; first part of the shift by 21 (10 + 11)
        shl 11, r1, r1                  ; shift the pad_position in r1 to bit 21: offset << 21
        and r2, r5, r2                  ; r2 is 8-bit signed, we need to clear the upper bits when negative, r5=MSK_SLOPE
        shl 1, r2, r2                   ; shift the slope to position 1:  slope << 1
        or r1, r2, r1                   ;   (offset << 21) | (slope << 1)
        mov MSK_LPID, r5                ; 0xFFF, mask used later

        syn                             ; wait for CPU3, which adds the 4 integrated charges
        sem b0000_0000_1000_0000        ; cpu3 writes something to say all are ready with the tracklets

; Code the third charge (read from charge_i) and build the complete 20-bit charge
; word in charge_i, either with a common dynamic shift (DYN_FP_Q = 1) or by
; scaling with the constant scale_q.
#ifeq DYN_FP_Q, 1     ; 26 clocks long + 1 before = 27
        mov 0x3F, r3                    ; 6-bit mask, may be paranoia, prepared for later
        or r8, charge_i, r8             ; here we have Q0 | Q1 | Q2

        ; now determine where the MSBit is set in this word!
        ; the three shifts have no destination register, only the resulting condition is used
        slr DYN_CHECK0, r8              ; check if, when we take the bits 7..2, the remaining upper bits are 0
        jmp cc_zero, _fit_fit_qrdy0

        slr DYN_CHECK1, r8              ; check if, when we take the bits 9..4, the remaining upper bits are 0
        jmp cc_zero, _fit_fit_qrdy1

        slr DYN_CHECK2, r8              ; check if, when we take the bits 11..6, the remaining upper bits are 0
        jmp cc_zero, _fit_fit_qrdy2

_fit_fit_qrdy3:
        ; largest shift, shift code 3
        slr DYN_SHIFT3, r0, r0          ; Q0
        slr DYN_SHIFT3, r4, r4          ; Q1
        slr DYN_SHIFT3, charge_i, r8    ; Q2
        and r0, r3, r0                  ; Q0 &= 0x3F, this is necessary here, as we didn't check whether the result is larger!
        and r4, r3, r4                  ; Q1 &= 0x3F
        and r8, r3, r8                  ; Q2 &= 0x3F
        sll DYN_SIZE, 3, r3             ; 3 << 6
        or r8, r3, r8                   ; Q2 | 3 << 6
        ; 0xFF in the upper 8 bits of the charge word is reserved: a tracklet is only
        ; sent when charge word[19..12] =/= 0xFF, so the value 0xFF has to be avoided here.
        ; Intended effect of the next two lines: 0xFF becomes 0xFE, any other value stays.
        ; NOTE(review): this requires the carry after the cmp to be set exactly when
        ; r8 =/= 0xFF (i.e. r8 < 0xFF, borrow); confirm the carry convention of cmp/adc.
        cmp r8, 0xFF                    ; compare with 0xFF
        adc r8, -1, r8                  ; r8 = r8 - 1 + carry
        jmp cc_uncond, _fit_fit_qrdy

_fit_fit_qrdy0:
        ; smallest shift, shift code 0 (nothing to OR into r8)
        slr DYN_SHIFT0, r0, r0          ; Q0
        slr DYN_SHIFT0, r4, r4          ; Q1
        slr DYN_SHIFT0, charge_i, r8    ; Q2
        ; masking with the max value is not necessary here, as we come here only if the shift above by DYN_CHECK0 resulted in a 0!
        jmp cc_uncond, _fit_fit_qrdy

_fit_fit_qrdy2:
        ; shift code 2
        slr DYN_SHIFT2, r0, r0          ; Q0
        slr DYN_SHIFT2, r4, r4          ; Q1
        slr DYN_SHIFT2, charge_i, r8    ; Q2
        ; masking with the max value is not necessary here, as we come here only if the shift above by DYN_CHECK2 resulted in a 0!
        sll DYN_SIZE, 2, r3             ; 2 << 6
        or r8, r3, r8                   ; Q2 | 2 << 6
        jmp cc_uncond, _fit_fit_qrdy

_fit_fit_qrdy1:
        ; shift code 1, falls through to _fit_fit_qrdy
        slr DYN_SHIFT1, r0, r0          ; Q0
        slr DYN_SHIFT1, r4, r4          ; Q1
        slr DYN_SHIFT1, charge_i, r8    ; Q2
        ; masking with the max value is not necessary here, as we come here only if the shift above by DYN_CHECK1 resulted in a 0!
        sll DYN_SIZE, 1, r3             ; 1 << 6
        or r8, r3, r8                   ; Q2 | 1 << 6

_fit_fit_qrdy:
        ; common end of the four cases: r8 = (shift code << 6) | Q2, r4 = Q1, r0 = Q0
        sll DYN_SIZE, r8, r9            ; (Sh, Q2) << 6
        or r9, r4, r9                   ; (Sh, Q2) << 6 | Q1
        sll DYN_SIZE, r9, r9            ; ( (Sh, Q2) << 6 | Q1 ) << 6
        or r9, r0, charge_i             ; ( ( (Sh, Q2) << 6 | Q1 ) << 6 ) | Q0
        ; here we have the full 20-bit charge word:
        ; shift(2) | Q2(6) | Q1(6) | Q0(6)
        ; r8 contains the upper 8 bits, we will need them later

#else           ; 9 clocks long + 12 before = 21
        mov charge_i, r3                ; move the charge to r3
        mul r3, scale_q, r3
        nop                             ; pipelining, remove later by rearranging
        sar 1, r13, r13                 ; r13 holds the scaled Q2 (upper word of the product); div by 2, as one bit less
        cmp r13, MSK_Q2                 ; r13 is the scaled Q2 and is 6-bit
        jmpr cc_ltu, +2                 ; skip the next line if r13 < 0x3F
        mov MAX_Q2, r13                 ; clip: r13 = MAX_Q2 (presumably 0x3E - confirm)
        ; here r13 is Q2, scaled and clipped to MAX_Q2
        ; build the final charge word
        ; we want to build a 20-bit PID and store it to the charge_i register.
        ; NOTE(review): the instructions below build (sQ2 << 14) | (sQ1 << 7) | sQ0;
        ; an older description of the layout as (Q0 << 13) | (Q1 << 6) | Q2 does not match them.
        shl 14, r13, r13                ; the length of sQ0 and sQ1 together is 7+7
                                        ; r0 already contains sQ0 | (sQ1 << 7)
        or  r13, r0, charge_i           ; (sQ2 << 14) | (sQ1 << 7) | sQ0
#endif
        ; so the floating point coding is 6 clocks longer than simply multiplying by a constant

        ; Split the 20-bit charge word in charge_i into LPID (lower 12 bits) and HPID
        ; (upper 8 bits), store the HPID in the byte of charge_i that belongs to this CPU
        ; and write the complete tracklet word to trackl_i.
        ; tracklet format, from the MSBit on the left (bit 31) to the LSBit (0)
        ; < pad_position within the MCM (11 bit) | LPID (12 bit) | slope (8 bit) | 0 >

        and r5, charge_i, r5            ; LPID: the lower 12 bits of the 20-bit charge word (r5 was MSK_LPID)
        shl 9, r5, r4                   ; LPID << 9, LPID is 12-bit, to the right are the 8-bit slope and one bit 0
        slr LEN_LPID, charge_i, r8      ; HPID is the upper 8 bits of the 20-bit charge word

#ifdef cpu0
  #ifeq DONT_SEND_EMPTY_HDR_TR, 1
    ; and if this word is 0xFFFFFF, we don't have to send anything except for end markers
    ; iext presumably supplies the upper bits of the immediate of the following mov - confirm
    iext 0xFFFFFF                       ; 3 x 8 bits => 24-bits with 1s, used to check for no tracklets
    mov 0xFFFFFF, r5
  #endif
#endif
        #ifdef cpu0
        mov r8, charge_i                ; HPID0 is at bits 7..0
        #endif
        #ifdef cpu1
        sll 8, r8, charge_i             ; HPID1 is at bits 15..8
        #endif
        #ifdef cpu2
        swp r8, charge_i                ; HPID2 is at bits 23..16
        #endif

        #ifeq TRACKLETS_FROM_CON10, 1
        #INF Using C10 instead of tracklets!
        mov c10, trackl_i               ; test value, programmed before through SCSN
        #else
        or r1, r4, trackl_i             ; (offset(10..0) << 21) | PID(11..0) << 9 | (slope(7..0) << 1)
        #endif

        ; here trackl_i is ready - the new tracklet. A global register is used, as
        ; - CPU0 will send the header with the position and bits 19..12 of each charge word (g0..2);
        ;   if specified, the header will be sent only when at least one tracklet follows
        ; - CPU1 will send the tracklet of CPU0 (g8) if its charge word[19..12] =/= 0xFF
        ; - CPU2 will send the tracklet of CPU1 (g9) if its charge word[19..12] =/= 0xFF
        ; - CPU3 will send the tracklet of CPU2 (g10) if its charge word[19..12] =/= 0xFF