; $Id$:
; TO DO: pad_position is not ready now!
; $Id$:
;
; This is an include file for the main fit program.
; The only jumps to outside this code are to _acq_out_rng, taken when the
; deflection is out of range. Do we need this now?
;
; Layout rule: ';' starts a comment that runs to the end of the line, so every
; instruction, label and directive below sits on its own line.
;
; input:
;   all fit registers
;   r12 = 0                  (set in the clear procedure)
;   r13 = (1 << nachkomma)   (set in the clear procedure)
; output:
;   32-bit tracklet word stored in trackl_0..2 (global reg 8..10)
;   20-bit charge word   stored in charge_0..2 (global reg 0.. 2)

;##############################################################
;#
;# fit parameter calculation
;#
;# SLOPE = ( N * XY - X * Y ) / ( N * XX - X * X )
;# POS   = ( XX * Y - X * XY ) / ( N * XX - X * X )
;#
;##############################################################

; --- merge the sums of the two windows and build the denominator ---
    add     f1, f9, r15             ; N  = N0 + N1   - merge the number of hits
    add     f3, f11, r1             ; X  = X0 + X1   - merge sum of X
    mul32   r1, r1, r3              ; X * X
    add     f4, f12, r2             ; XX = XX0 + XX1 - merge sum of X**2
    mul32   r15, r2, r4             ; N * XX
    add     f2, f10, r0             ; Q  = Q0 + Q1   - merged sums of ADC values:
                                    ;   bits 14..0 belong to window0, bits 31..16 to window1
    sub     r4, r3, r3              ; N * XX - X * X - the multiplication result needs 1 clock more!

; some initialized registers:
;   r12 = 0
;   r13 = (1 << nachkomma)
; div divides the 64-bit signed integer (bits 63..32 = r13, bits 31..0 = r12)
; by r3 (signed)
    div     r12, r3                 ; start 2**(32+nachkomma)/DENOMINATOR, ready in 19 clocks

; --- numerators, computed while the divider is running; (n) = clocks since div ---
    shl     PAD_BITS, f9, r5        ; 256 * N1                          ( 1)
    add     f13, r5, r5             ; Y1 + 256 * N1                     ( 2)
    add     f5, r5, r5              ; Y  = Y0 + Y1 + 256 * N1           ( 3)
    shl     PAD_BITS, f11, r6       ; 256 * X1                          ( 4)
    add     f14, r6, r6             ; XY1 + 256 * X1                    ( 5)
    add     f6, r6, r6              ; XY = XY0 + XY1 + 256 * X1         ( 6)
    ; NOTE(review): r3 held the divisor passed to div above and is overwritten
    ; here - presumably the divider has latched its operands; confirm.
    mus32   r1, r5, r3              ; X * Y                             ( 7)
    mus32   r1, r6, r4              ; X * XY                            ( 8)
    mus32   r2, r5, r1              ; XX * Y                            ( 9)
    mus32   r15, r6, r2             ; N * XY                            (10)
    sub     r1, r4, r1              ; OF = XX * Y - X * XY              (11)
    sub     r2, r3, r2              ; SL = N * XY - X * Y               (12)

; the deflection table is at address 0x0C0 in DMEM (initialized at 0xC030 in GIO)
; these 4 instructions were placed later, but can be executed here to use some
; of the wait clocks
    mov     DEFL_RNG_TBL_DM, r15    ; the start address of the min-max table in DMEM
    shl     3, f0, r7               ; 2**3 (=8) * ch - the offset in the table, 2x32 bit entries (signed)/ch
    add     r7, r15, r15            ; the address in DMEM
    shl     PAD_EXT, f0, r8         ; Ch(left)*2**PAD_EXT (PAD_EXT=5+8)
    sub     r8, offs_y, r8          ; transfer to the middle of the chip, offs_y is g12 and is loaded before
    mov     MSK_SLOPE, r5           ; needed later, here we need to wait some clocks anyway
    mov     ROUNDING_ADD, r9        ; prepare for the rounding

; 13 clocks after start of division, 6 clocks more needed, just wait?
    jmpr    cc_divb, 0              ; jump to itself on cc_divb: wait about 2 clocks
    die     r7                      ; get 2**(32+nachkomma)/DENOMINATOR
    mus     r7, r1, r1              ; lower 32 bit word of POS, we use the upper bits
    mus     r7, r2, r2              ; lower 32 bit word of SLOPE, we use the upper bits in r13
    add     r13, r8, r1             ; save the tracklet offset+(ch*pad_size-constant) in r1,
                                    ;   it could be negative!
; Scaling, rounding and range checks of the fitted position (r1) and slope (r2).
; One statement per line: ';' starts a comment that runs to the end of the line.
; Comments are kept off the preprocessor directive lines.

; INV_FIT_POS = 1: the counting of the channels will be reversed
#ifeq INV_FIT_POS, 1
    neg     r1, r1
#endif
    ;add    r13, defl_cor, r2       ; save the deflection+its correction in r2
    add     r13, c0, r2             ; save the deflection+its correction in r2

; here we have:
;   offset in r1 (1 PAD = 256*2**nachkommast)
;   slope  in r2 (1 PAD = 256*2**nachkommast) in PADS/timebin, may be 4-5 pads/30 timebins?
    mus     r1, scale_y, r1         ; scale properly, the factor is multiplied by 2**32
    mus     r2, scale_d, r2         ; the result is in bits 63..32 (r13)
    add     r13, r9, r1             ; rounding (r9 is expected to hold ROUNDING_ADD),
                                    ;   save the result with one clock delay
    add     r13, r9, r2
    sar     NACHKOMMAST, r1, r1     ; remove the additional bits in position

; --- clip the position to [-POS_MAX_ABS, +POS_MAX_ABS] ---
    jmp     cc_neg, _fit_fit_check_pos_neg
    ; r1 is not negative: check if r1 - position_max >= 0
    jmpr    cc_nzero, +3            ; r1 != 0: skip the next two instructions, go to the cmp
    mov     1, r1                   ; r1 == 0: replace the position by 1
    jmp     cc_uncond, _fit_fit_check_sl
    cmp     r1, POS_MAX_ABS
    jmp     cc_les, _fit_fit_check_sl   ; skip the rest if r1 <= POS_MAX_ABS
    mov     POS_MAX_ABS, r1         ; clip the position in r1 to POS_MAX_ABS
    jmp     cc_uncond, _fit_fit_check_sl

_fit_fit_check_pos_neg:
    mov     POS_MAX_ABS, r13        ; the max absolute value of the position
    ; r1 is negative: check if r1 + |position_min| >= 0
    add     r1, r13, r13            ; r13 = POS + POS_MAX_ABS
    jmp     cc_ges, _fit_fit_check_sl   ; if the result is not negative, skip the correction
    sub     r1, r13, r1             ; the result was negative: subtract it from the position
                                    ;   => move the position towards 0, exactly to -POS_MAX_ABS!

_fit_fit_check_sl:
    sar     NACHKOMMAST_S, r2, r2   ; remove the additional bits in slope
; INV_FIT_POS = 1: the counting of the channels will be reversed
#ifeq INV_FIT_POS, 1
    neg     r2, r2                  ; not clear if necessary and if here or before the range check?
#endif

; --- check if the slope is in the proper range ---
; The two limits are read with lra / lra+ into r13; the addressing details are
; not visible in this file (presumably the min-max table entry prepared in r15
; by the including code - confirm).
    lra     rr_dword, r13
    lra+    rr_dword, r13           ; the min limit
    cmp     r13, r2
    jmp     cc_gts, _acq_out_rng    ; jump out if min > slope
    lra     rr_dword, r13
    lra+    rr_dword, r13           ; the max limit
    cmp     r2, r13
    jmp     cc_gts, _acq_out_rng    ; jump out if slope > max

; artificially generate a tracklet-out-of-range condition
; in the corresponding CPU and so effectively suppress
; sending a tracklet, but do most of the calculations
#ifdef cpu0
#ifeq DIS_CPU0_TRACKLETS, 1
#INF Disable tracklets from CPU0 by generating out of range!
    jmp     cc_uncond, _acq_out_rng ; don't send tracklet, but almost calculated
#endif
#endif

#ifdef cpu1
#ifeq DIS_CPU1_TRACKLETS, 1
#INF Disable tracklets from CPU1 by generating out of range!
    jmp     cc_uncond, _acq_out_rng ; don't send tracklet, but almost calculated
#endif
#endif

#ifdef cpu2
#ifeq DIS_CPU2_TRACKLETS, 1
#INF Disable tracklets from CPU2 by generating out of range!
    jmp     cc_uncond, _acq_out_rng ; don't send tracklet, but almost calculated
#endif
#endif

; here we have offset and slope in r1 and r2. r0 contains the merged Q0/1 (in bits 14..0 and 31..16)
; r2 is the slope, 8-bit signed, and is checked to be within the limits
; r1 is the offset, 11-bit signed, and is checked to be within the correct limits
; the bits above the MSBit (bit 10) in r1 don't need to be cleared explicitly!
; now the charges
; One statement per line: ';' starts a comment that runs to the end of the line.
; Comments are kept off the preprocessor directive lines.
; On entry r0 holds the merged charge sums: Q0 in bits 14..0, Q1 in bits 31..16.
    slr     16, r0, r4              ; r4 contains Q1
    swp     r0, r0                  ; presumably swaps the two 16-bit halves, so that Q0 moves up
    slr     16, r0, r0              ; r0 contains Q0

; coding of the charges, floating point, with 4 possible shifts, optimally selected
#ifeq DYN_FP_Q, 1
    ; 1 clock long
    or      r0, r4, r8              ; r8 is Q0 | Q1
; or just multiplying by a constant
#else
    ; 12 clocks long
    mul     r0, scale_q, r0
    mul     r4, scale_q, r4
    mov     r13, r0                 ; the products are picked up from r13, one after the other
    mov     r13, r4
    ; now check if Q0 (in r0) and Q1 (in r4) are up to 0x7F
    cmp     r0, MSK_Q0Q1            ; the scaled Q0 and Q1 are 7-bit
    jmpr    cc_leu, +2              ; skip the next line if r0 <= 0x7F
    mov     MSK_Q0Q1, r0            ; clip: r0 = 0x7F
    cmp     r4, MSK_Q0Q1
    jmpr    cc_leu, +2              ; skip the next line if r4 <= 0x7F
    mov     MSK_Q0Q1, r4            ; clip: r4 = 0x7F
    ; here r0 contains scaled and clipped Q0
    ;      r4 contains scaled and clipped Q1
    shl     LEN_Q0Q1, r4, r4        ; sQ1 << 7
    or      r0, r4, r0              ; sQ0 | (sQ1 << 7)
    ; here we need already Q2 to scale and clip it to 0x3F
#endif

; < pad_position within the MCM (11 bit) | LPID (12 bit) | slope (8 bit) | 0 >
; prepare some part of the tracklet
    shl     10, r1, r1
    shl     11, r1, r1              ; 10 + 11: shift the pad_position in r1 to bit 21: offset << 21
    and     r2, r5, r2              ; r2 is 8-bit signed, we need to clear the upper bits when negative,
                                    ;   r5 is expected to hold MSK_SLOPE here
    shl     1, r2, r2               ; shift the slope to position 1: slope << 1
    or      r1, r2, r1              ; (offset << 21) | (slope << 1)
    mov     MSK_LPID, r5            ; 0xFFF, mask used later

    syn                             ; wait for CPU3, which adds the 4 integrated charges
    sem     b0000_0000_1000_0000    ; cpu3 writes something to say all are ready with the tracklets

#ifeq DYN_FP_Q, 1
    ; 26 clocks long + 1 before = 27
    mov     0x3F, r3                ; may be paranoia, prepare for later
    or      r8, charge_i, r8        ; here we have Q0 | Q1 | Q2
    ; now determine where the MSBit is set in this word!
    slr     DYN_CHECK0, r8          ; check if, when we take the bits 7..2, the remaining will be 0
    jmp     cc_zero, _fit_fit_qrdy0
    slr     DYN_CHECK1, r8          ; check if, when we take the bits 9..4, the remaining will be 0
    jmp     cc_zero, _fit_fit_qrdy1
    slr     DYN_CHECK2, r8          ; check if, when we take the bits 11..6, the remaining will be 0
    jmp     cc_zero, _fit_fit_qrdy2

_fit_fit_qrdy3:
    slr     DYN_SHIFT3, r0, r0      ; Q0
    slr     DYN_SHIFT3, r4, r4      ; Q1
    slr     DYN_SHIFT3, charge_i, r8    ; Q2
    and     r0, r3, r0              ; Q0 &= 0x3F, necessary here, as on this path we didn't check
                                    ;   if the result is larger!
    and     r4, r3, r4              ; Q1 &= 0x3F
    and     r8, r3, r8              ; Q2 &= 0x3F
    sll     DYN_SIZE, 3, r3         ; 3 << 6
    or      r8, r3, r8              ; Q2 | 3 << 6
    ; Intent (from the original comments): r8 = 0xFF must become 0xFE, any other
    ; value must stay unchanged.
    ; NOTE(review): the original comment said "compare with 0xFE" and "carry=1
    ; if r8 was 0xFF", but the code compares with 0xFF, and r8 + (-1) + carry
    ; only leaves r8 unchanged when carry = 1. Confirm the carry convention of
    ; cmp against the CPU manual.
    cmp     r8, 0xFF
    adc     r8, -1, r8              ; r8 = r8 - 1 + carry
    jmp     cc_uncond, _fit_fit_qrdy

_fit_fit_qrdy0:
    slr     DYN_SHIFT0, r0, r0      ; Q0
    slr     DYN_SHIFT0, r4, r4      ; Q1
    slr     DYN_SHIFT0, charge_i, r8    ; Q2
    ; masking with the max value is not necessary here, as we come here only if
    ; the shift above by DYN_CHECK0 resulted in a 0!
    ; no shift code is ORed in on this path
    jmp     cc_uncond, _fit_fit_qrdy

_fit_fit_qrdy2:
    slr     DYN_SHIFT2, r0, r0      ; Q0
    slr     DYN_SHIFT2, r4, r4      ; Q1
    slr     DYN_SHIFT2, charge_i, r8    ; Q2
    ; masking with the max value is not necessary here, as we come here only if
    ; the shift above by DYN_CHECK2 resulted in a 0!
    sll     DYN_SIZE, 2, r3         ; 2 << 6
    or      r8, r3, r8              ; Q2 | 2 << 6
    jmp     cc_uncond, _fit_fit_qrdy

_fit_fit_qrdy1:
    slr     DYN_SHIFT1, r0, r0      ; Q0
    slr     DYN_SHIFT1, r4, r4      ; Q1
    slr     DYN_SHIFT1, charge_i, r8    ; Q2
    ; masking with the max value is not necessary here, as we come here only if
    ; the shift above by DYN_CHECK1 resulted in a 0!
    sll     DYN_SIZE, 1, r3         ; 1 << 6
    or      r8, r3, r8              ; Q2 | 1 << 6
    ; falls through to _fit_fit_qrdy

_fit_fit_qrdy:
    sll     DYN_SIZE, r8, r9        ; (Sh, Q2) << 6
    or      r9, r4, r9              ; (Sh, Q2) << 6 | Q1
    sll     DYN_SIZE, r9, r9        ; ( (Sh, Q2) << 6 | Q1 ) << 6
    or      r9, r0, charge_i        ; ( ( (Sh, Q2) << 6 | Q1 ) << 6 ) | Q0
    ; here we have the full 20 bit charge word:
    ;   shift(2) | Q2(6) | Q1(6) | Q0(6)
    ; r8 contains the upper 8 bits, we will need them later
#else
    ; 9 clocks long + 12 before = 21
    mov     charge_i, r3            ; move the charge to r3
    mul     r3, scale_q, r3
    nop                             ; pipelining, remove later by rearranging
    sar     1, r13, r13             ; div by 2, as one bit less
    cmp     r13, MSK_Q2             ; r13 is scaled Q2 and is 6-bit
    jmpr    cc_ltu, +2              ; skip the next line if r13 < MSK_Q2
    mov     MAX_Q2, r13             ; clip: r13 = MAX_Q2 (the original comment gave 0x3E and
                                    ;   named r3; the register written is r13)
    ; here r13 is Q2 scaled and clipped
    ; build the final charge word and store it to the charge_i register.
    ; NOTE(review): the original comment described the 20-bit PID as
    ; (Q0 << 13) | (Q1 << 6) | Q2; the code below builds
    ; (sQ2 << 14) | (sQ1 << 7) | sQ0.
    shl     14, r13, r13            ; the length of sQ0 and sQ1 together is 7+7
    ; r0 contains already sQ0 | (sQ1 << 7)
    or      r13, r0, charge_i       ; (sQ2 << 14) | (sQ1 << 7) | sQ0
#endif
; so the floating point coding is 6 clocks longer than simply multiplying by a constant

; tracklet format, from MSBit left (bit 31) to the LSBit (0)
; < pad_position within the MCM (11 bit) | LPID (12 bit) | slope (8 bit) | 0 >
    and     r5, charge_i, r5        ; LPID: the lower 12 bits of the 20-bit charge word (r5 was MSK_LPID)
    shl     9, r5, r4               ; LPID << 9, LPID is 12 bit, to the right are the 8-bit slope and one bit 0
    slr     LEN_LPID, charge_i, r8  ; HPID is the upper 8 bits of the 20-bit charge word

#ifdef cpu0
#ifeq DONT_SEND_EMPTY_HDR_TR, 1
    ; and if this word is 0xFFFFFF, we don't have to send anything except for end markers
    iext    0xFFFFFF                ; 3 x 8 bits => 24 bits of 1s, used to check for no tracklets
    mov     0xFFFFFF, r5
#endif
#endif

; place the HPID of this CPU in its own byte of charge_i
#ifdef cpu0
    mov     r8, charge_i            ; HPID0 is at bits 7..0
#endif
#ifdef cpu1
    sll     8, r8, charge_i         ; HPID1 is at bits 15..8
#endif
#ifdef cpu2
    swp     r8, charge_i            ; HPID2 is at bits 23..16
#endif

#ifeq TRACKLETS_FROM_CON10, 1
#INF Using C10 instead of tracklets!
    mov     c10, trackl_i           ; programmed before through SCSN
#else
    or      r1, r4, trackl_i        ; (offset(10..0) << 21) | PID(11..0) << 9 | (slope(7..0) << 1)
#endif

; here trackl_i is ready - the new tracklet. Use a global register, as
; - CPU0 will send the header with position and bits 19..12 of each charge word (g0..2)
;   if specified, the header will be sent only when at least one tracklet follows
; - CPU1 will send the tracklet of CPU0 (g8)  if its charge word[19..12] =/= 0xFF
; - CPU2 will send the tracklet of CPU1 (g9)  if its charge word[19..12] =/= 0xFF
; - CPU3 will send the tracklet of CPU2 (g10) if its charge word[19..12] =/= 0xFF