/**********************************************************************
This file is part of Crack dot Com's free source code release of Golgotha.
for information about compiling & licensing issues visit this URL
 If that doesn't help, contact Jonathan Clark at 
  golgotha_source@usa.net (Subject should have "GOLG" in it) 
***********************************************************************/

#include "software/r1_software_globals.hh"
#include "software/inline_fpu.hh"

extern sw32 had_subdivisions;

void texture_scanline_perspective_unlit(w16 *start_pixel,
                                        sw32 start_x,
                                        void *_left,//perspective_span *left,
                                        sw32 width)
{
  start_pixel = (w16 *)((w8 *)start_pixel + start_x);

  perspective_span *left = (perspective_span *)_left;

  _asm
  {
    //left_z = 1.f / left->ooz;
    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
    
    //sw32 had_subdivisions = width & (~15);
    //num_subdivisions = width >> 4;
    //num_leftover     = width & 15;
    
    mov esi,dword ptr [left]
    mov eax,dword ptr [width]

    fld1
    fdiv qword ptr [esi]perspective_span.ooz

    mov ebx,eax
    and eax,15

    shr ebx,4
    mov ecx,width

    and ecx,(~15)
    mov dword ptr [num_leftover],eax
    
    mov dword ptr [num_subdivisions],ebx
    mov dword ptr [had_subdivisions],ecx
    
    fld st(0)
        
    fmul dword ptr [esi]perspective_span.soz
    fxch st(1)
        
    fmul dword ptr [esi]perspective_span.toz
    fxch st(1)

    fistp dword ptr [left_s]
    fistp dword ptr [left_t]

    mov eax,dword ptr [cur_grads].s_adjust
    mov ebx,dword ptr [cur_grads].t_adjust

    add eax,dword ptr [left_s]
    add ebx,dword ptr [left_t]

    mov dword ptr [left_s],eax
    mov dword ptr [left_t],ebx
    
    //clear these out
    mov dword ptr [dsdx_frac],0
    mov dword ptr [dtdx_frac],0
  }

  if (num_subdivisions)
  {
    _asm
    {
      //ooz_right = left->ooz + (cur_grads.doozdxspan);
      //soz_right = left->soz + (cur_grads.dsozdxspan);
      //toz_right = left->toz + (cur_grads.dtozdxspan);

      mov esi,dword ptr [left]
      mov edi,dword ptr [start_pixel]

      fld qword ptr [esi]perspective_span.ooz
      fld dword ptr [esi]perspective_span.soz
      fld dword ptr [esi]perspective_span.toz

      //t s o
      fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
      fxch st(2)

      //o s t

      fadd qword ptr [cur_grads]tri_gradients.doozdxspan
      fxch st(1)

      //s o t

      fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
      fxch st(2)

      //t o s

      fstp dword ptr [toz_right]
      fxch st(1)

      //s o

      fstp dword ptr [soz_right]

      fstp dword ptr [ooz_right]
      
      //calculate the 1st right_z
      fld1
      fdiv dword ptr [ooz_right]

      //calculate starting fractional and integral values for s and t
      //esi = (r1_software_texture_ptr >> 1) + (starting_s_coordinate >> 16) + ((starting_t_coordinate >> 16) << r1_software_twidth_log2)
      //ecx = starting_s_coordinate << 16
      //edx = starting_t_coordinate << 16
      //dx  = starting_light_value

      mov esi,dword ptr [r1_software_texture_ptr]
      mov eax,dword ptr [left_s]

      shr esi,1
      mov ebx,dword ptr [left_t]
    
      sar eax,16
      mov edx,dword ptr [left_t]

      sar ebx,16
      add esi,eax

      mov cl,byte ptr [r1_software_twidth_log2]
      shl ebx,cl
      
      sal edx,16
      mov ecx,dword ptr [left_s]
    
      sal ecx,16
      add esi,ebx
    }

    while (num_subdivisions)
    {
      _asm
      {
        //right_s = qftoi(soz_right * right_z);
        //right_t = qftoi(toz_right * right_z);
        
        //right_z is in st0
        fld st(0)
        
        fmul dword ptr [soz_right]
        fxch st(1)
        
        fmul dword ptr [toz_right]
        fxch st(1)

        fistp dword ptr [right_s]
        fistp dword ptr [right_t]

      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
      //in the leftover span, calculate the end of that.

      //if (num_subdivisions!=1)
      //{
          cmp dword ptr [num_subdivisions],1
          je  last_subdivision
        
          //ooz_right += (cur_grads.doozdxspan);
          //soz_right += (cur_grads.dsozdxspan);
          //toz_right += (cur_grads.dtozdxspan);
          
          fld dword ptr [ooz_right]
          fadd qword ptr [cur_grads]tri_gradients.doozdxspan

          fld dword ptr [soz_right]
          fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
                                   
          fld dword ptr [toz_right]
          fadd dword ptr [cur_grads]tri_gradients.dtozdxspan

          fxch st(2)
          fstp dword ptr [ooz_right]

          fstp dword ptr [soz_right]

          fstp dword ptr [toz_right]

          fld1
          fdiv dword ptr [ooz_right]

          jmp not_last_subdivision
      //}
      //else
      //if (num_leftover > 1)
      //{

      last_subdivision:
          cmp dword ptr [num_leftover],1
          jle not_last_subdivision
        
          //calculate the right_z for the end of the leftover span
          //ooz_right += (cur_grads.doozdx * num_leftover);
          //soz_right += (cur_grads.dsozdx * num_leftover);
          //toz_right += (cur_grads.dtozdx * num_leftover);

          fild dword ptr [num_leftover]
          
          //todo: pipeline these fpu ops
          fld  qword ptr [cur_grads]tri_gradients.doozdx
          fmul st(0),st(1)
          fadd dword ptr [ooz_right]
          fstp dword ptr [ooz_right]

          fld  dword ptr [cur_grads]tri_gradients.dsozdx
          fmul st(0),st(1)
          fadd dword ptr [soz_right]
          fstp dword ptr [soz_right]

          fld  dword ptr [cur_grads]tri_gradients.dtozdx
          fmul st(0),st(1)
          fadd dword ptr [toz_right]
          fstp dword ptr [toz_right]

          fstp st(0) //nifty thing i found, a 1 cycle fpu pop
        
          fld1
          fdiv dword ptr [ooz_right]
      //}
            
      not_last_subdivision:
        //cap the right_s and right_t's so that they're valid

        mov eax,dword ptr [right_s]
        mov ebx,dword ptr [right_t]
        
        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
 
        //cap the right s and t
        cmp eax,0
        jge cmp_eax_high

        mov eax,0
        jmp cmp_ebx_low

      cmp_eax_high:
        cmp eax,dword ptr [s_mask]
        jle cmp_ebx_low

        mov eax,dword ptr [s_mask]

      cmp_ebx_low:
        cmp ebx,0
        jge cmp_ebx_high

        mov ebx,0
        jmp done_compare
      
      cmp_ebx_high:
        cmp ebx,dword ptr [t_mask]
        jle done_compare

        mov ebx,dword ptr [t_mask]

      done_compare:

        //store the right_s and right_t
        //so they can be copied into left_s and left_t at the end of the 16-pixel span
        //(they cannot be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4)
        
        mov dword ptr [right_s],eax
        mov dword ptr [right_t],ebx

        sub eax,dword ptr [left_s]
        push ebp

        sar eax,4
        sub ebx,dword ptr [left_t]
        
        sar ebx,4
        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
        
        sar eax,16
        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
        
        sar ebx,16
        mov cl,byte ptr [r1_software_twidth_log2]
        
        shl ebx,cl

        add eax,ebx
        mov ebx,0

        //s_t_carry[1] = integral_dsdx + integral_dtdx< 1)
    {      
      if (had_subdivisions==0)
      {
        //calculate the right_z for the end of span
        ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
        soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
        toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
        
        //calculate the z at the right endpoint
        _asm fld1
        _asm fdiv dword ptr [ooz_right]
      }
      else
      {
        //the correct ending right_z is already being calculated
        //(see the if (num_subdivisions!=1) case above)
      }

      _asm
      {
        //calculate starting fractional and integral values for s and t           
        
        mov esi,dword ptr [r1_software_texture_ptr]
        mov eax,dword ptr [left_s]

        shr esi,1
        mov ebx,dword ptr [left_t]
    
        sar eax,16
        mov edx,dword ptr [left_t]

        sar ebx,16
        add esi,eax

        mov cl,byte ptr [r1_software_twidth_log2]
        shl ebx,cl
      
        sal edx,16
        mov ecx,dword ptr [left_s]
      
        sal ecx,16
        add esi,ebx
        
        mov edi,dword ptr [start_pixel]

        //calculate the right endpoint
        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
        
        //right_z is in st0
        fld st(0)
        
        fmul dword ptr [soz_right]
        fxch st(1)
        
        fmul dword ptr [toz_right]
        fxch st(1)

        fistp dword ptr [right_s]
        fistp dword ptr [right_t]

        mov eax,dword ptr [right_s]
        mov ebx,dword ptr [right_t]
        
        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
 
        //cap the right s and t
        cmp eax,0
        jge cmp_eax_high_2

        mov eax,0
        jmp cmp_ebx_low_2

      cmp_eax_high_2:
        cmp eax,dword ptr [s_mask]
        jle cmp_ebx_low_2

        mov eax,dword ptr [s_mask]

      cmp_ebx_low_2:
        cmp ebx,0
        jge cmp_ebx_high_2

        mov ebx,0
        jmp done_compare_2
      
      cmp_ebx_high_2:
        cmp ebx,dword ptr [t_mask]
        jle done_compare_2

        mov ebx,dword ptr [t_mask]

      done_compare_2:
            
        //calculate the deltas (left to right)
        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
        
        push ebp
        mov ebp,num_leftover

        sub eax,dword ptr [left_s]
        sub ebx,dword ptr [left_t]

        mov dword ptr [temp_dsdx],eax
        mov dword ptr [temp_dtdx],ebx

        fild dword ptr [temp_dsdx]
        fild dword ptr [temp_dtdx]

        fmul dword ptr [inverse_leftover_lookup + ebp*4]
        fxch st(1)

        fmul dword ptr [inverse_leftover_lookup + ebp*4]
        fxch st(1)

        fistp dword ptr [temp_dtdx]
        fistp dword ptr [temp_dsdx]
        
        //calculate the fractional and integral delta vars
        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<>16) + ((temp_dtdx>>16)<>16) + ((left_t>>16)<