/*
===========================================================================

Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").

Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms.
You should have received a copy of these additional terms immediately following the terms and
conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code.
If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact
in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

/*
================================================================================================

Contains the DxtEncoder implementation for SSE2.

================================================================================================
*/

#pragma hdrstop
#include "DXTCodec_local.h"
#include "DXTCodec.h"

#if defined( ID_WIN_X86_SSE2_INTRIN ) || ( ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) )

//#define TEST_COMPRESSION
#ifdef TEST_COMPRESSION
#include
#endif

#define INSET_COLOR_SHIFT		4		// inset the bounding box with ( range >> shift )
#define INSET_ALPHA_SHIFT		5		// inset alpha channel

#define C565_5_MASK				0xF8	// 0xFF minus last three bits
#define C565_6_MASK				0xFC	// 0xFF minus last two bits

#define NVIDIA_7X_HARDWARE_BUG_FIX		// keep the DXT5 colors sorted as: max, min

#if !defined( R_SHUFFLE_D )
#define R_SHUFFLE_D( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#endif

typedef uint16 word;
typedef uint32 dword;
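/*
================================================================================================
Shuffle immediates and fixed-point reciprocals

R_SHUFFLE_D( x, y, z, w ) packs four 2-bit lane selectors, lowest lane first, into the 8-bit
immediate used by pshufd / pshuflw ( _mm_shuffle_epi32 / _mm_shufflelo_epi16 ).

The SIMD_SSE2_word_div_by_3/6/14 tables below hold fixed-point reciprocals so that a division
by a small constant can be done as a high-16 multiply ( pmulhw / _mm_mulhi_epi16 ):

	x / 3  ==  ( x * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16		// for the operand ranges used here

Illustrative scalar sketch of the 1/3 and 2/3 DXT1 palette interpolants this enables
(placeholder names, assuming 8-bit channel values; not the code path used below):

	const dword div3 = ( 1 << 16 ) / 3 + 1;
	byte color2 = (byte)( ( ( 2 * maxC + minC ) * div3 ) >> 16 );	// 2/3 max + 1/3 min
	byte color3 = (byte)( ( ( maxC + 2 * minC ) * div3 ) >> 16 );	// 1/3 max + 2/3 min
================================================================================================
*/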
ALIGN16( static __m128i SIMD_SSE2_zero ) = { 0, 0, 0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_byte_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF };
ALIGN16( static dword SIMD_SSE2_dword_word_mask[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF };
ALIGN16( static dword SIMD_SSE2_dword_red_mask[4] ) = { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF };
ALIGN16( static dword SIMD_SSE2_dword_green_mask[4] ) = { 0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00 };
ALIGN16( static dword SIMD_SSE2_dword_blue_mask[4] ) = { 0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000 };
ALIGN16( static dword SIMD_SSE2_dword_colorMask_1010[4] ) = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
ALIGN16( static dword SIMD_SSE2_dword_colorMask_0100[4] ) = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask0[4] ) = { 7<<0, 0, 7<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask1[4] ) = { 7<<3, 0, 7<<3, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask2[4] ) = { 7<<6, 0, 7<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask3[4] ) = { 7<<9, 0, 7<<9, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask4[4] ) = { 7<<12, 0, 7<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask5[4] ) = { 7<<15, 0, 7<<15, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask6[4] ) = { 7<<18, 0, 7<<18, 0 };
ALIGN16( static dword SIMD_SSE2_dword_alpha_bit_mask7[4] ) = { 7<<21, 0, 7<<21, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask0[4] ) = { 3<<0, 0, 3<<0, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask1[4] ) = { 3<<2, 0, 3<<2, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask2[4] ) = { 3<<4, 0, 3<<4, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask3[4] ) = { 3<<6, 0, 3<<6, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask4[4] ) = { 3<<8, 0, 3<<8, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask5[4] ) = { 3<<10, 0, 3<<10, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask6[4] ) = { 3<<12, 0, 3<<12, 0 };
ALIGN16( static dword SIMD_SSE2_dword_color_bit_mask7[4] ) = { 3<<14, 0, 3<<14, 0 };
ALIGN16( static word SIMD_SSE2_word_0[8] ) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_1[8] ) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 };
ALIGN16( static word SIMD_SSE2_word_2[8] ) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 };
ALIGN16( static word SIMD_SSE2_word_3[8] ) = { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 };
ALIGN16( static word SIMD_SSE2_word_7[8] ) = { 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007 };
ALIGN16( static word SIMD_SSE2_word_8[8] ) = { 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008 };
ALIGN16( static word SIMD_SSE2_word_31[8] ) = { 31, 31, 31, 31, 31, 31, 31, 31 };
ALIGN16( static word SIMD_SSE2_word_63[8] ) = { 63, 63, 63, 63, 63, 63, 63, 63 };
ALIGN16( static word SIMD_SSE2_word_127[8] ) = { 127, 127, 127, 127, 127, 127, 127, 127 };
ALIGN16( static word SIMD_SSE2_word_255[8] ) = { 255, 255, 255, 255, 255, 255, 255, 255 };
ALIGN16( static word SIMD_SSE2_word_center_128[8] ) = { 128, 128, 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_div_by_3[8] ) = { (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1, (1<<16)/3+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_6[8] ) = { (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1, (1<<16)/6+1 };
ALIGN16( static word SIMD_SSE2_word_div_by_14[8] ) = { (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1, (1<<16)/14+1 };
ALIGN16( static word SIMD_SSE2_word_scale_7_9_11_13[8] ) = { 7, 7, 9, 9, 11, 11, 13, 13 };
ALIGN16( static word SIMD_SSE2_word_scale_7_5_3_1[8] ) = { 7, 7, 5, 5, 3, 3, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_scale_5_3_1[8] ) = { 5, 3, 1, 0, 5, 3, 1, 0 };
ALIGN16( static word SIMD_SSE2_word_scale_1_3_5[8] ) = { 1, 3, 5, 0, 1, 3, 5, 0 };
ALIGN16( static word SIMD_SSE2_word_insetShift[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgRound[8] ) = { ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_COLOR_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftUp[8] ) = { 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_COLOR_SHIFT, 1 << INSET_ALPHA_SHIFT, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgShiftDown[8] ) = { 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_COLOR_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgQuantMask[8] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0xFF };
ALIGN16( static word SIMD_SSE2_word_insetYCoCgRep[8] ) = { 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0, 1 << ( 16 - 5 ), 1 << ( 16 - 6 ), 1 << ( 16 - 5 ), 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Round[8] ) = { 0, ((1<<(INSET_COLOR_SHIFT-1))-1), 0, ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Mask[8] ) = { 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftUp[8] ) = { 1, 1 << INSET_COLOR_SHIFT, 1, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5ShiftDown[8] ) = { 0, 1 << ( 16 - INSET_COLOR_SHIFT ), 0, 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5QuantMask[8] ) = { 0xFF, C565_6_MASK, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
ALIGN16( static word SIMD_SSE2_word_insetNormalDXT5Rep[8] ) = { 0, 1 << ( 16 - 6 ), 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcRound[8] ) = { ((1<<(INSET_ALPHA_SHIFT-1))-1), ((1<<(INSET_ALPHA_SHIFT-1))-1), 0, 0, 0, 0, 0, 0 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcMask[8] ) = { 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftUp[8] ) = { 1 << INSET_ALPHA_SHIFT, 1 << INSET_ALPHA_SHIFT, 1, 1, 1, 1, 1, 1 };
ALIGN16( static word SIMD_SSE2_word_insetNormal3DcShiftDown[8] ) = { 1 << ( 16 - INSET_ALPHA_SHIFT ), 1 << ( 16 - INSET_ALPHA_SHIFT ), 0, 0, 0, 0, 0, 0 };
ALIGN16( static byte SIMD_SSE2_byte_0[16] ) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_1[16] ) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
ALIGN16( static byte SIMD_SSE2_byte_2[16] ) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };
ALIGN16( static byte SIMD_SSE2_byte_3[16] ) = { 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 };
ALIGN16( static byte SIMD_SSE2_byte_4[16] ) = { 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 };
ALIGN16( static byte SIMD_SSE2_byte_7[16] ) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 };
ALIGN16( static byte SIMD_SSE2_byte_8[16] ) = { 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 };
ALIGN16( static byte SIMD_SSE2_byte_not[16] ) = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
ALIGN16( static byte SIMD_SSE2_byte_colorMask[16] ) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_colorMask2[16] ) = { 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_ctx1Mask[16] ) = { 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_diagonalMask[16] ) = { 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask0[16] ) = { 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask1[16] ) = { 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask2[16] ) = { 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask3[16] ) = { 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_scale_mask4[16] ) = { 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00 };
ALIGN16( static byte SIMD_SSE2_byte_minus_128_0[16] ) = { (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0, (byte)-128, (byte)-128, 0, 0 };
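/*
================================================================================================
Fast block compressors

The helpers below operate on a 64 byte 4x4 RGBA block and the min/max colors (or alpha values)
extracted from it.  A sketch of how one DXT1 block flows through them, taken from the loop body
of CompressImageDXT1Fast_SSE2 further down (block, minColor and maxColor are the 16-byte
aligned locals declared there):

	ExtractBlock_SSE2( inBuf + i * 4, width, block );
	GetMinMaxBBox_SSE2( block, minColor, maxColor );
	InsetColorsBBox_SSE2( minColor, maxColor );
	EmitUShort( ColorTo565( maxColor ) );
	EmitUShort( ColorTo565( minColor ) );
	EmitColorIndices_SSE2( block, minColor, maxColor );
================================================================================================
*/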
/*
========================
idDxtEncoder::ExtractBlock_SSE2

params:	inPtr		- input image, 4 bytes per pixel
paramO:	colorBlock	- 4*4 output tile, 4 bytes per pixel
========================
*/
ID_INLINE void idDxtEncoder::ExtractBlock_SSE2( const byte * inPtr, int width, byte * colorBlock ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	__asm {
		mov			esi, inPtr
		mov			edi, colorBlock
		mov			eax, width
		shl			eax, 2
		movdqa		xmm0, xmmword ptr [esi]
		movdqa		xmmword ptr [edi+ 0], xmm0
		movdqa		xmm1, xmmword ptr [esi+eax]		// + 4 * width
		movdqa		xmmword ptr [edi+16], xmm1
		movdqa		xmm2, xmmword ptr [esi+eax*2]	// + 8 * width
		add			esi, eax
		movdqa		xmmword ptr [edi+32], xmm2
		movdqa		xmm3, xmmword ptr [esi+eax*2]	// + 12 * width
		movdqa		xmmword ptr [edi+48], xmm3
	}
#elif defined ( ID_WIN_X86_SSE2_INTRIN )
	*((__m128i *)(&colorBlock[ 0])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 0 ) );
	*((__m128i *)(&colorBlock[16])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 1 ) );
	*((__m128i *)(&colorBlock[32])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 2 ) );
	*((__m128i *)(&colorBlock[48])) = _mm_load_si128( (__m128i *)( inPtr + width * 4 * 3 ) );
#else
	assert( false );
#endif
}

/*
========================
idDxtEncoder::GetMinMaxBBox_SSE2

Takes the extents of the bounding box of the colors in the 4x4 block.
params: colorBlock - 4*4 input tile, 4 bytes per pixel paramO: minColor - Min 4 byte output color paramO: maxColor - Max 4 byte output color ======================== */ ID_INLINE void idDxtEncoder::GetMinMaxBBox_SSE2( const byte * colorBlock, byte * minColor, byte * maxColor ) const { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) __asm { mov eax, colorBlock mov esi, minColor mov edi, maxColor movdqa xmm0, xmmword ptr [eax+ 0] movdqa xmm1, xmmword ptr [eax+ 0] pminub xmm0, xmmword ptr [eax+16] pmaxub xmm1, xmmword ptr [eax+16] pminub xmm0, xmmword ptr [eax+32] pmaxub xmm1, xmmword ptr [eax+32] pminub xmm0, xmmword ptr [eax+48] pmaxub xmm1, xmmword ptr [eax+48] pshufd xmm3, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 ) pshufd xmm4, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 ) pminub xmm0, xmm3 pmaxub xmm1, xmm4 pshuflw xmm6, xmm0, R_SHUFFLE_D( 2, 3, 2, 3 ) pshuflw xmm7, xmm1, R_SHUFFLE_D( 2, 3, 2, 3 ) pminub xmm0, xmm6 pmaxub xmm1, xmm7 movd dword ptr [esi], xmm0 movd dword ptr [edi], xmm1 } #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&colorBlock[ 0])); __m128i block1 = *((__m128i *)(&colorBlock[16])); __m128i block2 = *((__m128i *)(&colorBlock[32])); __m128i block3 = *((__m128i *)(&colorBlock[48])); __m128i max1 = _mm_max_epu8( block0, block1 ); __m128i min1 = _mm_min_epu8( block0, block1 ); __m128i max2 = _mm_max_epu8( block2, block3 ); __m128i min2 = _mm_min_epu8( block2, block3 ); __m128i max3 = _mm_max_epu8( max1, max2 ); __m128i min3 = _mm_min_epu8( min1, min2 ); __m128i max4 = _mm_shuffle_epi32( max3, R_SHUFFLE_D( 2, 3, 2, 3 ) ); __m128i min4 = _mm_shuffle_epi32( min3, R_SHUFFLE_D( 2, 3, 2, 3 ) ); __m128i max5 = _mm_max_epu8( max3, max4 ); __m128i min5 = _mm_min_epu8( min3, min4 ); __m128i max6 = _mm_shufflelo_epi16( max5, R_SHUFFLE_D( 2, 3, 2, 3 ) ); __m128i min6 = _mm_shufflelo_epi16( min5, R_SHUFFLE_D( 2, 3, 2, 3 ) ); max6 = _mm_max_epu8( max5, max6 ); min6 = _mm_min_epu8( min5, min6 ); *((int *)maxColor) = _mm_cvtsi128_si32( max6 ); *((int *)minColor) = _mm_cvtsi128_si32( min6 ); #else assert( false ); #endif } /* ======================== idDxtEncoder::InsetColorsBBox_SSE2 ======================== */ ID_INLINE void idDxtEncoder::InsetColorsBBox_SSE2( byte * minColor, byte * maxColor ) const { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) __asm { mov esi, minColor mov edi, maxColor movd xmm0, dword ptr [esi] movd xmm1, dword ptr [edi] punpcklbw xmm0, SIMD_SSE2_byte_0 punpcklbw xmm1, SIMD_SSE2_byte_0 movdqa xmm2, xmm1 psubw xmm2, xmm0 pmulhw xmm2, SIMD_SSE2_word_insetShift paddw xmm0, xmm2 psubw xmm1, xmm2 packuswb xmm0, xmm0 packuswb xmm1, xmm1 movd dword ptr [esi], xmm0 movd dword ptr [edi], xmm1 } #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i min = _mm_cvtsi32_si128( *(int *)minColor ); __m128i max = _mm_cvtsi32_si128( *(int *)maxColor ); __m128i xmm0 = _mm_unpacklo_epi8( min, *(__m128i *)SIMD_SSE2_byte_0 ); __m128i xmm1 = _mm_unpacklo_epi8( max, *(__m128i *)SIMD_SSE2_byte_0 ); __m128i xmm2 = _mm_sub_epi16( xmm1, xmm0 ); xmm2 = _mm_mulhi_epi16( xmm2, *(__m128i *)SIMD_SSE2_word_insetShift ); xmm0 = _mm_add_epi16( xmm0, xmm2 ); xmm1 = _mm_sub_epi16( xmm1, xmm2 ); xmm0 = _mm_packus_epi16( xmm0, xmm0 ); xmm1 = _mm_packus_epi16( xmm1, xmm1 ); *((int *)minColor) = _mm_cvtsi128_si32( xmm0 ); *((int *)maxColor) = _mm_cvtsi128_si32( xmm1 ); #else assert( false ); #endif } /* ======================== idDxtEncoder::EmitColorIndices_SSE2 params: colorBlock - 16 pixel block for which to find color indices paramO: minColor - Min alpha found paramO: 
maxColor - Max alpha found return: 4 byte color index block ======================== */ void idDxtEncoder::EmitColorIndices_SSE2( const byte * colorBlock, const byte * minColor_, const byte * maxColor_ ) { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) ALIGN16( byte color0[16] ); ALIGN16( byte color1[16] ); ALIGN16( byte color2[16] ); ALIGN16( byte color3[16] ); ALIGN16( byte result[16] ); byte *outPtr = outData; __asm { mov esi, maxColor_ mov edi, minColor_ pxor xmm7, xmm7 movdqa result, xmm7 movd xmm0, dword ptr [esi] pand xmm0, SIMD_SSE2_byte_colorMask punpcklbw xmm0, xmm7 pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 ) pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 ) psrlw xmm4, 5 psrlw xmm5, 6 por xmm0, xmm4 por xmm0, xmm5 movd xmm1, dword ptr [edi] pand xmm1, SIMD_SSE2_byte_colorMask punpcklbw xmm1, xmm7 pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 ) pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 ) psrlw xmm4, 5 psrlw xmm5, 6 por xmm1, xmm4 por xmm1, xmm5 movdqa xmm2, xmm0 packuswb xmm2, xmm7 pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color0, xmm2 movdqa xmm6, xmm0 paddw xmm6, xmm0 paddw xmm6, xmm1 pmulhw xmm6, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 packuswb xmm6, xmm7 pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color2, xmm6 movdqa xmm3, xmm1 packuswb xmm3, xmm7 pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color1, xmm3 paddw xmm1, xmm1 paddw xmm0, xmm1 pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 packuswb xmm0, xmm7 pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color3, xmm0 mov eax, 32 mov esi, colorBlock loop1: // iterates 2 times movq xmm3, qword ptr [esi+eax+0] pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0 movq xmm5, qword ptr [esi+eax+8] pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0 movdqa xmm0, xmm3 movdqa xmm6, xmm5 psadbw xmm0, color0 psadbw xmm6, color0 packssdw xmm0, xmm6 movdqa xmm1, xmm3 movdqa xmm6, xmm5 psadbw xmm1, color1 psadbw xmm6, color1 packssdw xmm1, xmm6 movdqa xmm2, xmm3 movdqa xmm6, xmm5 psadbw xmm2, color2 psadbw xmm6, color2 packssdw xmm2, xmm6 psadbw xmm3, color3 psadbw xmm5, color3 packssdw xmm3, xmm5 movq xmm4, qword ptr [esi+eax+16] pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 ) movq xmm5, qword ptr [esi+eax+24] pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color0 psadbw xmm7, color0 packssdw xmm6, xmm7 packssdw xmm0, xmm6 // d1 movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color1 psadbw xmm7, color1 packssdw xmm6, xmm7 packssdw xmm1, xmm6 // d1 movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color2 psadbw xmm7, color2 packssdw xmm6, xmm7 packssdw xmm2, xmm6 // d2 psadbw xmm4, color3 psadbw xmm5, color3 packssdw xmm4, xmm5 packssdw xmm3, xmm4 // d3 movdqa xmm7, result pslld xmm7, 16 movdqa xmm4, xmm0 movdqa xmm5, xmm1 pcmpgtw xmm0, xmm3 // b0 pcmpgtw xmm1, xmm2 // b1 pcmpgtw xmm4, xmm2 // b2 pcmpgtw xmm5, xmm3 // b3 pcmpgtw xmm2, xmm3 // b4 pand xmm4, xmm1 // x0 pand xmm5, xmm0 // x1 pand xmm2, xmm0 // x2 por xmm4, xmm5 pand xmm2, SIMD_SSE2_word_1 pand xmm4, SIMD_SSE2_word_2 por xmm2, xmm4 pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 ) punpcklwd xmm2, SIMD_SSE2_word_0 punpcklwd xmm5, SIMD_SSE2_word_0 pslld xmm5, 8 por xmm7, xmm5 por xmm7, xmm2 movdqa result, xmm7 sub eax, 32 jge loop1 mov esi, outPtr pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 ) pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 ) pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 ) pslld xmm4, 2 
pslld xmm5, 4 pslld xmm6, 6 por xmm7, xmm4 por xmm7, xmm5 por xmm7, xmm6 movd dword ptr [esi], xmm7 } outData += 4; #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c zero = SIMD_SSE2_zero; __m128c result = SIMD_SSE2_zero; __m128c color0, color1, color2, color3; __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ ); __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ ); __m128c blocka[2], blockb[2]; blocka[0] = *((__m128i *)(&colorBlock[ 0])); blocka[1] = *((__m128i *)(&colorBlock[32])); blockb[0] = *((__m128i *)(&colorBlock[16])); blockb[1] = *((__m128i *)(&colorBlock[48])); temp0 = _mm_and_si128( maxColor, (const __m128i &)SIMD_SSE2_byte_colorMask ); temp0 = _mm_unpacklo_epi8( temp0, zero ); temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) ); temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) ); temp4 = _mm_srli_epi16( temp4, 5 ); temp5 = _mm_srli_epi16( temp5, 6 ); temp0 = _mm_or_si128( temp0, temp4 ); temp0 = _mm_or_si128( temp0, temp5 ); temp1 = _mm_and_si128( minColor, (const __m128i &)SIMD_SSE2_byte_colorMask ); temp1 = _mm_unpacklo_epi8( temp1, zero ); temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) ); temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) ); temp4 = _mm_srli_epi16( temp4, 5 ); temp5 = _mm_srli_epi16( temp5, 6 ); temp1 = _mm_or_si128( temp1, temp4 ); temp1 = _mm_or_si128( temp1, temp5 ); temp2 = _mm_packus_epi16( temp0, zero ); color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp6 = _mm_add_epi16( temp0, temp0 ); temp6 = _mm_add_epi16( temp6, temp1 ); temp6 = _mm_mulhi_epi16( temp6, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 temp6 = _mm_packus_epi16( temp6, zero ); color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp3 = _mm_packus_epi16( temp1, zero ); color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp1 = _mm_add_epi16( temp1, temp1 ); temp0 = _mm_add_epi16( temp0, temp1 ); temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 temp0 = _mm_packus_epi16( temp0, zero ); color3 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) ); for ( int i = 1; i >= 0; i-- ) { // Load block temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp0 = _mm_sad_epu8( temp3, color0 ); temp6 = _mm_sad_epu8( temp5, color0 ); temp0 = _mm_packs_epi32( temp0, temp6 ); temp1 = _mm_sad_epu8( temp3, color1 ); temp6 = _mm_sad_epu8( temp5, color1 ); temp1 = _mm_packs_epi32( temp1, temp6 ); temp2 = _mm_sad_epu8( temp3, color2 ); temp6 = _mm_sad_epu8( temp5, color2 ); temp2 = _mm_packs_epi32( temp2, temp6 ); temp3 = _mm_sad_epu8( temp3, color3 ); temp5 = _mm_sad_epu8( temp5, color3 ); temp3 = _mm_packs_epi32( temp3, temp5 ); // Load block temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp6 = _mm_sad_epu8( temp4, color0 ); temp7 = _mm_sad_epu8( temp5, color0 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp0 = _mm_packs_epi32( temp0, temp6 ); // d0 temp6 = _mm_sad_epu8( temp4, color1 ); temp7 = _mm_sad_epu8( temp5, color1 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp1 = _mm_packs_epi32( temp1, temp6 ); // d1 temp6 = _mm_sad_epu8( temp4, 
color2 ); temp7 = _mm_sad_epu8( temp5, color2 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp2 = _mm_packs_epi32( temp2, temp6 ); // d2 temp4 = _mm_sad_epu8( temp4, color3 ); temp5 = _mm_sad_epu8( temp5, color3 ); temp4 = _mm_packs_epi32( temp4, temp5 ); temp3 = _mm_packs_epi32( temp3, temp4 ); // d3 temp7 = _mm_slli_epi32( result, 16 ); temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2 temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3 temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0 temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1 temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4 temp4 = _mm_and_si128( temp4, temp1 ); // x0 temp5 = _mm_and_si128( temp5, temp0 ); // x1 temp2 = _mm_and_si128( temp2, temp0 ); // x2 temp4 = _mm_or_si128( temp4, temp5 ); temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 ); temp2 = _mm_or_si128( temp2, temp4 ); temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 ); temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 ); temp5 = _mm_slli_epi32( temp5, 8 ); temp7 = _mm_or_si128( temp7, temp5 ); result = _mm_or_si128( temp7, temp2 ); } temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) ); temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) ); temp4 = _mm_slli_epi32( temp4, 2 ); temp5 = _mm_slli_epi32( temp5, 4 ); temp6 = _mm_slli_epi32( temp6, 6 ); temp7 = _mm_or_si128( result, temp4 ); temp7 = _mm_or_si128( temp7, temp5 ); temp7 = _mm_or_si128( temp7, temp6 ); unsigned int out = _mm_cvtsi128_si32( temp7 ); EmitUInt( out ); #else assert( false ); #endif } /* ======================== idDxtEncoder::EmitColorAlphaIndices_SSE2 params: colorBlock - 16 pixel block for which find color indexes paramO: minColor - Min color found paramO: maxColor - Max color found return: 4 byte color index block ======================== */ void idDxtEncoder::EmitColorAlphaIndices_SSE2( const byte *colorBlock, const byte *minColor_, const byte *maxColor_ ) { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) ALIGN16( byte color0[16] ); ALIGN16( byte color1[16] ); ALIGN16( byte color2[16] ); ALIGN16( byte color3[16] ); ALIGN16( byte result[16] ); byte *outPtr = outData; __asm { mov esi, maxColor_ mov edi, minColor_ pxor xmm7, xmm7 movdqa result, xmm7 movd xmm0, dword ptr [esi] pand xmm0, SIMD_SSE2_byte_colorMask punpcklbw xmm0, xmm7 pshuflw xmm4, xmm0, R_SHUFFLE_D( 0, 3, 2, 3 ) pshuflw xmm5, xmm0, R_SHUFFLE_D( 3, 1, 3, 3 ) psrlw xmm4, 5 psrlw xmm5, 6 por xmm0, xmm4 por xmm0, xmm5 movd xmm1, dword ptr [edi] pand xmm1, SIMD_SSE2_byte_colorMask punpcklbw xmm1, xmm7 pshuflw xmm4, xmm1, R_SHUFFLE_D( 0, 3, 2, 3 ) pshuflw xmm5, xmm1, R_SHUFFLE_D( 3, 1, 3, 3 ) psrlw xmm4, 5 psrlw xmm5, 6 por xmm1, xmm4 por xmm1, xmm5 movdqa xmm2, xmm0 packuswb xmm2, xmm7 pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color0, xmm2 movdqa xmm6, xmm0 paddw xmm6, xmm1 psrlw xmm6, 1 packuswb xmm6, xmm7 pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color2, xmm6 movdqa xmm3, xmm1 packuswb xmm3, xmm7 pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color1, xmm3 movdqa color3, xmm7 mov eax, 32 mov esi, colorBlock loop1: // iterates 2 times movq xmm3, qword ptr [esi+eax+0] pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) movq xmm5, qword ptr [esi+eax+8] pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) movdqa xmm0, xmm3 movdqa xmm6, xmm5 psadbw xmm0, color0 psadbw 
xmm6, color0 packssdw xmm0, xmm6 movdqa xmm1, xmm3 movdqa xmm6, xmm5 psadbw xmm1, color1 psadbw xmm6, color1 packssdw xmm1, xmm6 movdqa xmm2, xmm3 movdqa xmm6, xmm5 psadbw xmm2, color2 psadbw xmm6, color2 packssdw xmm2, xmm6 shufps xmm3, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 ) psrld xmm3, 24 packssdw xmm3, xmm3 movq xmm4, qword ptr [esi+eax+16] pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 ) movq xmm5, qword ptr [esi+eax+24] pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color0 psadbw xmm7, color0 packssdw xmm6, xmm7 packssdw xmm0, xmm6 // d1 movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color1 psadbw xmm7, color1 packssdw xmm6, xmm7 packssdw xmm1, xmm6 // d1 movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color2 psadbw xmm7, color2 packssdw xmm6, xmm7 packssdw xmm2, xmm6 // d2 shufps xmm4, xmm5, R_SHUFFLE_D( 0, 2, 0, 2 ) psrld xmm4, 24 packssdw xmm4, xmm4 punpcklqdq xmm3, xmm4 // c3 movdqa xmm7, result pslld xmm7, 16 movdqa xmm4, xmm2 pcmpgtw xmm2, xmm0 // b0 pcmpgtw xmm4, xmm1 // b1 pcmpgtw xmm1, xmm0 // b2 pmaxsw xmm3, SIMD_SSE2_word_127 // b3 pcmpeqw xmm3, SIMD_SSE2_word_127 pand xmm2, xmm4 por xmm2, xmm3 // b0 & b1 | b3 pxor xmm1, xmm4 por xmm1, xmm3 // b2 ^ b1 | b3 pand xmm2, SIMD_SSE2_word_2 pand xmm1, SIMD_SSE2_word_1 por xmm2, xmm1 pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 ) punpcklwd xmm2, SIMD_SSE2_word_0 punpcklwd xmm5, SIMD_SSE2_word_0 pslld xmm5, 8 por xmm7, xmm5 por xmm7, xmm2 movdqa result, xmm7 sub eax, 32 jge loop1 mov esi, outPtr pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 ) pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 ) pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 ) pslld xmm4, 2 pslld xmm5, 4 pslld xmm6, 6 por xmm7, xmm4 por xmm7, xmm5 por xmm7, xmm6 movd dword ptr [esi], xmm7 } outData += 4; #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c zero = SIMD_SSE2_zero; __m128c result = SIMD_SSE2_zero; __m128c color0, color1, color2; __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ ); __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ ); __m128c blocka[2], blockb[2]; blocka[0] = *((__m128i *)(&colorBlock[ 0])); blocka[1] = *((__m128i *)(&colorBlock[32])); blockb[0] = *((__m128i *)(&colorBlock[16])); blockb[1] = *((__m128i *)(&colorBlock[48])); temp0 = _mm_and_si128( maxColor, *(__m128c*)SIMD_SSE2_byte_colorMask ); temp0 = _mm_unpacklo_epi8( temp0, zero ); temp4 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 0, 3, 2, 3 ) ); temp5 = _mm_shufflelo_epi16( temp0, R_SHUFFLE_D( 3, 1, 3, 3 ) ); temp4 = _mm_srli_epi16( temp4, 5 ); temp5 = _mm_srli_epi16( temp5, 6 ); temp0 = _mm_or_si128( temp0, temp4 ); temp0 = _mm_or_si128( temp0, temp5 ); temp1 = _mm_and_si128( minColor, *(__m128c*)SIMD_SSE2_byte_colorMask ); temp1 = _mm_unpacklo_epi8( temp1, zero ); temp4 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 3, 2, 3 ) ); temp5 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 3, 1, 3, 3 ) ); temp4 = _mm_srli_epi16( temp4, 5 ); temp5 = _mm_srli_epi16( temp5, 6 ); temp1 = _mm_or_si128( temp1, temp4 ); temp1 = _mm_or_si128( temp1, temp5 ); temp2 = _mm_packus_epi16( temp0, zero ); color0 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp6 = _mm_add_epi16( temp0, temp0 ); temp6 = _mm_srli_epi16( temp6, 1 ); // diff from color temp6 = _mm_packus_epi16( temp6, zero ); color2 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp3 = _mm_packus_epi16( temp1, zero ); color1 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 1, 0, 1 ) ); // not used //color3 = zero; for ( int i = 1; i >= 0; 
i-- ) { // Load block temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp0 = _mm_sad_epu8( temp3, color0 ); temp6 = _mm_sad_epu8( temp5, color0 ); temp0 = _mm_packs_epi32( temp0, temp6 ); temp1 = _mm_sad_epu8( temp3, color1 ); temp6 = _mm_sad_epu8( temp5, color1 ); temp1 = _mm_packs_epi32( temp1, temp6 ); temp2 = _mm_sad_epu8( temp3, color2 ); temp6 = _mm_sad_epu8( temp5, color2 ); temp2 = _mm_packs_epi32( temp2, temp6 ); // diff from color temp3 = _mm_shuffle_ps( temp3, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) ); temp3 = _mm_srli_epi32( temp3, 24 ); temp3 = _mm_packs_epi32( temp3, temp3 ); // Load block temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp6 = _mm_sad_epu8( temp4, color0 ); temp7 = _mm_sad_epu8( temp5, color0 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp0 = _mm_packs_epi32( temp0, temp6 ); // d0 temp6 = _mm_sad_epu8( temp4, color1 ); temp7 = _mm_sad_epu8( temp5, color1 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp1 = _mm_packs_epi32( temp1, temp6 ); // d1 temp6 = _mm_sad_epu8( temp4, color2 ); temp7 = _mm_sad_epu8( temp5, color2 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp2 = _mm_packs_epi32( temp2, temp6 ); // d2 // diff from color temp4 = _mm_shuffle_ps( temp4, temp5, R_SHUFFLE_D( 0, 2, 0, 2 ) ); // c3 temp4 = _mm_srli_epi32( temp4, 24 ); temp4 = _mm_packs_epi32( temp4, temp4 ); temp3 = _mm_unpacklo_epi64( temp3, temp4 ); temp7 = _mm_slli_epi32( result, 16 ); // diff from color temp4 = _mm_cmpgt_epi16( temp2, temp1 ); // b1 temp2 = _mm_cmpgt_epi16( temp2, temp0 ); // b0 temp1 = _mm_cmpgt_epi16( temp1, temp0 ); // b2 temp3 = _mm_max_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 ); // b3 temp3 = _mm_cmpeq_epi16( temp3, (const __m128i &)SIMD_SSE2_word_127 ); temp2 = _mm_and_si128( temp2, temp4 ); temp2 = _mm_or_si128( temp2, temp3 ); // b0 & b1 | b3 temp1 = _mm_xor_si128( temp1, temp4 ); temp1 = _mm_or_si128( temp1, temp3 ); // b2 ^ b1 | b3 temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_2 ); temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_1 ); temp2 = _mm_or_si128( temp2, temp1 ); temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 ); temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 ); temp5 = _mm_slli_epi32( temp5, 8 ); temp7 = _mm_or_si128( temp7, temp5 ); result = _mm_or_si128( temp7, temp2 ); } temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) ); temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) ); temp4 = _mm_slli_epi32( temp4, 2 ); temp5 = _mm_slli_epi32( temp5, 4 ); temp6 = _mm_slli_epi32( temp6, 6 ); temp7 = _mm_or_si128( result, temp4 ); temp7 = _mm_or_si128( temp7, temp5 ); temp7 = _mm_or_si128( temp7, temp6 ); unsigned int out = _mm_cvtsi128_si32( temp7 ); EmitUInt( out ); #else assert( false ); #endif } /* ======================== idDxtEncoder::EmitCoCgIndices_SSE2 params: colorBlock - 16 pixel block for which to find color indices paramO: minColor - Min alpha found paramO: maxColor - Max alpha found return: 4 byte color index block ======================== */ void idDxtEncoder::EmitCoCgIndices_SSE2( const byte *colorBlock, const byte *minColor_, 
const byte *maxColor_ ) { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) ALIGN16( byte color0[16] ); ALIGN16( byte color1[16] ); ALIGN16( byte color2[16] ); ALIGN16( byte color3[16] ); ALIGN16( byte result[16] ); byte *outPtr = outData; __asm { mov esi, maxColor_ mov edi, minColor_ pxor xmm7, xmm7 movdqa result, xmm7 movd xmm0, dword ptr [esi] pand xmm0, SIMD_SSE2_byte_colorMask2 pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color0, xmm0 movd xmm1, dword ptr [edi] pand xmm1, SIMD_SSE2_byte_colorMask2 pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color1, xmm1 punpcklbw xmm0, xmm7 punpcklbw xmm1, xmm7 movdqa xmm6, xmm1 paddw xmm1, xmm0 paddw xmm0, xmm1 pmulhw xmm0, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 packuswb xmm0, xmm7 pshufd xmm0, xmm0, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color2, xmm0 paddw xmm1, xmm6 pmulhw xmm1, SIMD_SSE2_word_div_by_3 // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 packuswb xmm1, xmm7 pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 1, 0, 1 ) movdqa color3, xmm1 mov eax, 32 mov esi, colorBlock loop1: // iterates 2 times movq xmm3, qword ptr [esi+eax+0] pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm4, SIMD_SSE2_dword_0 movq xmm5, qword ptr [esi+eax+8] pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) // punpckldq xmm5, SIMD_SSE2_dword_0 movdqa xmm0, xmm3 movdqa xmm6, xmm5 psadbw xmm0, color0 psadbw xmm6, color0 packssdw xmm0, xmm6 movdqa xmm1, xmm3 movdqa xmm6, xmm5 psadbw xmm1, color1 psadbw xmm6, color1 packssdw xmm1, xmm6 movdqa xmm2, xmm3 movdqa xmm6, xmm5 psadbw xmm2, color2 psadbw xmm6, color2 packssdw xmm2, xmm6 psadbw xmm3, color3 psadbw xmm5, color3 packssdw xmm3, xmm5 movq xmm4, qword ptr [esi+eax+16] pshufd xmm4, xmm4, R_SHUFFLE_D( 0, 2, 1, 3 ) movq xmm5, qword ptr [esi+eax+24] pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 2, 1, 3 ) movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color0 psadbw xmm7, color0 packssdw xmm6, xmm7 packssdw xmm0, xmm6 // d1 movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color1 psadbw xmm7, color1 packssdw xmm6, xmm7 packssdw xmm1, xmm6 // d1 movdqa xmm6, xmm4 movdqa xmm7, xmm5 psadbw xmm6, color2 psadbw xmm7, color2 packssdw xmm6, xmm7 packssdw xmm2, xmm6 // d2 psadbw xmm4, color3 psadbw xmm5, color3 packssdw xmm4, xmm5 packssdw xmm3, xmm4 // d3 movdqa xmm7, result pslld xmm7, 16 movdqa xmm4, xmm0 movdqa xmm5, xmm1 pcmpgtw xmm0, xmm3 // b0 pcmpgtw xmm1, xmm2 // b1 pcmpgtw xmm4, xmm2 // b2 pcmpgtw xmm5, xmm3 // b3 pcmpgtw xmm2, xmm3 // b4 pand xmm4, xmm1 // x0 pand xmm5, xmm0 // x1 pand xmm2, xmm0 // x2 por xmm4, xmm5 pand xmm2, SIMD_SSE2_word_1 pand xmm4, SIMD_SSE2_word_2 por xmm2, xmm4 pshufd xmm5, xmm2, R_SHUFFLE_D( 2, 3, 0, 1 ) punpcklwd xmm2, SIMD_SSE2_word_0 punpcklwd xmm5, SIMD_SSE2_word_0 pslld xmm5, 8 por xmm7, xmm5 por xmm7, xmm2 movdqa result, xmm7 sub eax, 32 jge loop1 mov esi, outPtr pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 2, 3, 0 ) pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 3, 0, 1 ) pshufd xmm6, xmm7, R_SHUFFLE_D( 3, 0, 1, 2 ) pslld xmm4, 2 pslld xmm5, 4 pslld xmm6, 6 por xmm7, xmm4 por xmm7, xmm5 por xmm7, xmm6 movd dword ptr [esi], xmm7 } outData += 4; #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c zero = SIMD_SSE2_zero; __m128c result = SIMD_SSE2_zero; __m128c color0, color1, color2, color3; __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; __m128c minColor = _mm_cvtsi32_si128( *(int *)minColor_ ); __m128c maxColor = _mm_cvtsi32_si128( *(int *)maxColor_ ); __m128c blocka[2], blockb[2]; blocka[0] = *((__m128i *)(&colorBlock[ 0])); blocka[1] = *((__m128i *)(&colorBlock[32])); blockb[0] = 
*((__m128i *)(&colorBlock[16])); blockb[1] = *((__m128i *)(&colorBlock[48])); temp7 = zero; temp0 = maxColor; temp0 = _mm_and_si128( temp0, *(__m128c*)SIMD_SSE2_byte_colorMask2 ); color0 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp1 = minColor; temp1 = _mm_and_si128( temp1, *(__m128c*)SIMD_SSE2_byte_colorMask2 ); color1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp0 = _mm_unpacklo_epi8( color0, zero ); temp1 = _mm_unpacklo_epi8( color1, zero ); temp6 = _mm_add_epi16( temp1, temp0 ); temp0 = _mm_add_epi16( temp0, temp6 ); temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 temp0 = _mm_packus_epi16( temp0, zero ); color2 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 0, 1, 0, 1 ) ); temp1 = _mm_add_epi16( temp1, temp6 ); temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_div_by_3 ); // * ( ( 1 << 16 ) / 3 + 1 ) ) >> 16 temp1 = _mm_packus_epi16( temp1, zero ); color3 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 1, 0, 1 ) ); for ( int i = 1; i >= 0; i-- ) { // Load block temp3 = _mm_shuffle_epi32( blocka[i], R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp5 = _mm_shuffle_ps( blocka[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp0 = _mm_sad_epu8( temp3, color0 ); temp6 = _mm_sad_epu8( temp5, color0 ); temp0 = _mm_packs_epi32( temp0, temp6 ); temp1 = _mm_sad_epu8( temp3, color1 ); temp6 = _mm_sad_epu8( temp5, color1 ); temp1 = _mm_packs_epi32( temp1, temp6 ); temp2 = _mm_sad_epu8( temp3, color2 ); temp6 = _mm_sad_epu8( temp5, color2 ); temp2 = _mm_packs_epi32( temp2, temp6 ); temp3 = _mm_sad_epu8( temp3, color3 ); temp5 = _mm_sad_epu8( temp5, color3 ); temp3 = _mm_packs_epi32( temp3, temp5 ); // Load block temp4 = _mm_shuffle_epi32( blockb[i], R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp5 = _mm_shuffle_ps( blockb[i], zero, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp6 = _mm_sad_epu8( temp4, color0 ); temp7 = _mm_sad_epu8( temp5, color0 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp0 = _mm_packs_epi32( temp0, temp6 ); // d0 temp6 = _mm_sad_epu8( temp4, color1 ); temp7 = _mm_sad_epu8( temp5, color1 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp1 = _mm_packs_epi32( temp1, temp6 ); // d1 temp6 = _mm_sad_epu8( temp4, color2 ); temp7 = _mm_sad_epu8( temp5, color2 ); temp6 = _mm_packs_epi32( temp6, temp7 ); temp2 = _mm_packs_epi32( temp2, temp6 ); // d2 temp4 = _mm_sad_epu8( temp4, color3 ); temp5 = _mm_sad_epu8( temp5, color3 ); temp4 = _mm_packs_epi32( temp4, temp5 ); temp3 = _mm_packs_epi32( temp3, temp4 ); // d3 temp7 = _mm_slli_epi32( result, 16 ); temp4 = _mm_cmpgt_epi16( temp0, temp2 ); // b2 temp5 = _mm_cmpgt_epi16( temp1, temp3 ); // b3 temp0 = _mm_cmpgt_epi16( temp0, temp3 ); // b0 temp1 = _mm_cmpgt_epi16( temp1, temp2 ); // b1 temp2 = _mm_cmpgt_epi16( temp2, temp3 ); // b4 temp4 = _mm_and_si128( temp4, temp1 ); // x0 temp5 = _mm_and_si128( temp5, temp0 ); // x1 temp2 = _mm_and_si128( temp2, temp0 ); // x2 temp4 = _mm_or_si128( temp4, temp5 ); temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_word_2 ); temp2 = _mm_or_si128( temp2, temp4 ); temp5 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp2 = _mm_unpacklo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_0 ); temp5 = _mm_unpacklo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_0 ); temp5 = _mm_slli_epi32( temp5, 8 ); temp7 = _mm_or_si128( temp7, temp5 ); result = 
_mm_or_si128( temp7, temp2 ); } temp4 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 1, 2, 3, 0 ) ); temp5 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 2, 3, 0, 1 ) ); temp6 = _mm_shuffle_epi32( result, R_SHUFFLE_D( 3, 0, 1, 2 ) ); temp4 = _mm_slli_epi32( temp4, 2 ); temp5 = _mm_slli_epi32( temp5, 4 ); temp6 = _mm_slli_epi32( temp6, 6 ); temp7 = _mm_or_si128( result, temp4 ); temp7 = _mm_or_si128( temp7, temp5 ); temp7 = _mm_or_si128( temp7, temp6 ); unsigned int out = _mm_cvtsi128_si32( temp7 ); EmitUInt( out ); #else assert( false ); #endif } /* ======================== idDxtEncoder::EmitAlphaIndices_SSE2 params: block - 16 pixel block for which to find alpha indices paramO: minAlpha - Min alpha found paramO: maxAlpha - Max alpha found ======================== */ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int minAlpha_, const int maxAlpha_ ) { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) assert( maxAlpha_ >= minAlpha_ ); byte *outPtr = outData; __asm { mov esi, block movdqa xmm0, xmmword ptr [esi+ 0] movdqa xmm5, xmmword ptr [esi+16] movdqa xmm6, xmmword ptr [esi+32] movdqa xmm4, xmmword ptr [esi+48] psrld xmm0, 24 psrld xmm5, 24 psrld xmm6, 24 psrld xmm4, 24 packuswb xmm0, xmm5 packuswb xmm6, xmm4 //--------------------- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14 // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14 // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14 // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14 // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14 // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14 movd xmm5, maxAlpha_ pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) movdqa xmm7, xmm5 movd xmm2, minAlpha_ pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) movdqa xmm3, xmm2 pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13 pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1 pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1 pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13 paddw xmm5, xmm2 paddw xmm7, xmm3 paddw xmm5, SIMD_SSE2_word_7 paddw xmm7, SIMD_SSE2_word_7 pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 ) pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 ) pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 ) packuswb xmm1, xmm1 // ab1 packuswb xmm2, xmm2 // ab2 packuswb xmm3, xmm3 // ab3 packuswb xmm0, xmm6 // alpha block pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 ) pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 ) pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 ) packuswb xmm4, xmm4 // ab4 packuswb xmm5, xmm5 // ab5 packuswb xmm6, xmm6 // ab6 packuswb xmm7, xmm7 // ab7 pmaxub xmm1, xmm0 pmaxub xmm2, xmm0 pmaxub xmm3, xmm0 pcmpeqb xmm1, xmm0 pcmpeqb xmm2, xmm0 pcmpeqb xmm3, xmm0 pmaxub xmm4, xmm0 pmaxub xmm5, xmm0 pmaxub xmm6, xmm0 pmaxub xmm7, xmm0 pcmpeqb xmm4, xmm0 pcmpeqb xmm5, xmm0 pcmpeqb xmm6, xmm0 pcmpeqb xmm7, xmm0 movdqa xmm0, SIMD_SSE2_byte_8 paddsb xmm0, xmm1 paddsb xmm2, xmm3 paddsb xmm4, xmm5 paddsb xmm6, xmm7 paddsb xmm0, xmm2 paddsb xmm4, xmm6 paddsb xmm0, xmm4 pand xmm0, SIMD_SSE2_byte_7 movdqa xmm1, SIMD_SSE2_byte_2 pcmpgtb xmm1, xmm0 pand xmm1, SIMD_SSE2_byte_1 pxor xmm0, xmm1 movdqa xmm1, xmm0 movdqa xmm2, xmm0 
movdqa xmm3, xmm0 movdqa xmm4, xmm0 movdqa xmm5, xmm0 movdqa xmm6, xmm0 movdqa xmm7, xmm0 psrlq xmm1, 8- 3 psrlq xmm2, 16- 6 psrlq xmm3, 24- 9 psrlq xmm4, 32-12 psrlq xmm5, 40-15 psrlq xmm6, 48-18 psrlq xmm7, 56-21 pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0 pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1 pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2 pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3 pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4 pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5 pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6 pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7 por xmm0, xmm1 por xmm2, xmm3 por xmm4, xmm5 por xmm6, xmm7 por xmm0, xmm2 por xmm4, xmm6 por xmm0, xmm4 mov esi, outPtr movd [esi+0], xmm0 pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) movd [esi+3], xmm1 } outData += 6; #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&block[ 0])); __m128i block1 = *((__m128i *)(&block[16])); __m128i block2 = *((__m128i *)(&block[32])); __m128i block3 = *((__m128i *)(&block[48])); __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; temp0 = _mm_srli_epi32( block0, 24 ); temp5 = _mm_srli_epi32( block1, 24 ); temp6 = _mm_srli_epi32( block2, 24 ); temp4 = _mm_srli_epi32( block3, 24 ); temp0 = _mm_packus_epi16( temp0, temp5 ); temp6 = _mm_packus_epi16( temp6, temp4 ); //--------------------- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14 // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14 // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14 // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14 // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14 // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14 temp5 = _mm_cvtsi32_si128( maxAlpha_ ); temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp2 = _mm_cvtsi32_si128( minAlpha_ ); temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 ); temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 ); temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 ); temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 ); temp5 = _mm_add_epi16( temp5, temp2 ); temp7 = _mm_add_epi16( temp7, temp3 ); temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 ); temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 ); temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 ); temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 ); temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) ); temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) ); temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) ); temp1 = _mm_packus_epi16( temp1, temp1 ); temp2 = _mm_packus_epi16( temp2, temp2 ); temp3 = _mm_packus_epi16( temp3, temp3 ); temp0 = _mm_packus_epi16( temp0, temp6 ); temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) ); temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) ); temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) ); temp4 = _mm_packus_epi16( temp4, temp4 ); temp5 = _mm_packus_epi16( temp5, temp5 ); temp6 = 
_mm_packus_epi16( temp6, temp6 ); temp7 = _mm_packus_epi16( temp7, temp7 ); temp1 = _mm_max_epu8( temp1, temp0 ); temp2 = _mm_max_epu8( temp2, temp0 ); temp3 = _mm_max_epu8( temp3, temp0 ); temp1 = _mm_cmpeq_epi8( temp1, temp0 ); temp2 = _mm_cmpeq_epi8( temp2, temp0 ); temp3 = _mm_cmpeq_epi8( temp3, temp0 ); temp4 = _mm_max_epu8( temp4, temp0 ); temp5 = _mm_max_epu8( temp5, temp0 ); temp6 = _mm_max_epu8( temp6, temp0 ); temp7 = _mm_max_epu8( temp7, temp0 ); temp4 = _mm_cmpeq_epi8( temp4, temp0 ); temp5 = _mm_cmpeq_epi8( temp5, temp0 ); temp6 = _mm_cmpeq_epi8( temp6, temp0 ); temp7 = _mm_cmpeq_epi8( temp7, temp0 ); temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 ); temp2 = _mm_adds_epi8( temp2, temp3 ); temp4 = _mm_adds_epi8( temp4, temp5 ); temp6 = _mm_adds_epi8( temp6, temp7 ); temp0 = _mm_adds_epi8( temp0, temp2 ); temp4 = _mm_adds_epi8( temp4, temp6 ); temp0 = _mm_adds_epi8( temp0, temp4 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 ); temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 ); temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 ); temp0 = _mm_xor_si128( temp0, temp1 ); temp1 = _mm_srli_epi64( temp0, 8 - 3 ); temp2 = _mm_srli_epi64( temp0, 16 - 6 ); temp3 = _mm_srli_epi64( temp0, 24 - 9 ); temp4 = _mm_srli_epi64( temp0, 32 - 12 ); temp5 = _mm_srli_epi64( temp0, 40 - 15 ); temp6 = _mm_srli_epi64( temp0, 48 - 18 ); temp7 = _mm_srli_epi64( temp0, 56 - 21 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 ); temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 ); temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 ); temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 ); temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 ); temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 ); temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 ); temp0 = _mm_or_si128( temp0, temp1 ); temp2 = _mm_or_si128( temp2, temp3 ); temp4 = _mm_or_si128( temp4, temp5 ); temp6 = _mm_or_si128( temp6, temp7 ); temp0 = _mm_or_si128( temp0, temp2 ); temp4 = _mm_or_si128( temp4, temp6 ); temp0 = _mm_or_si128( temp0, temp4 ); int out = _mm_cvtsi128_si32( temp0 ); EmitUInt( out ); outData--; temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) ); out = _mm_cvtsi128_si32( temp1 ); EmitUInt( out ); outData--; #else assert( false ); #endif } /* ======================== idDxtEncoder::EmitAlphaIndices_SSE2 ======================== */ void idDxtEncoder::EmitAlphaIndices_SSE2( const byte *block, const int channelBitOffset, const int minAlpha_, const int maxAlpha_ ) { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) assert( maxAlpha_ >= minAlpha_ ); byte *outPtr = outData; __asm { movd xmm7, channelBitOffset mov esi, block movdqa xmm0, xmmword ptr [esi+ 0] movdqa xmm5, xmmword ptr [esi+16] movdqa xmm6, xmmword ptr [esi+32] movdqa xmm4, xmmword ptr [esi+48] psrld xmm0, xmm7 psrld xmm5, xmm7 psrld xmm6, xmm7 psrld xmm4, xmm7 pand xmm0, SIMD_SSE2_dword_byte_mask pand xmm5, SIMD_SSE2_dword_byte_mask pand xmm6, SIMD_SSE2_dword_byte_mask pand xmm4, SIMD_SSE2_dword_byte_mask packuswb xmm0, xmm5 packuswb xmm6, xmm4 //--------------------- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14 // ab2 = ( 11 * 
maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14 // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14 // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14 // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14 // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14 movd xmm5, maxAlpha_ pshuflw xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm5, xmm5, R_SHUFFLE_D( 0, 0, 0, 0 ) movdqa xmm7, xmm5 movd xmm2, minAlpha_ pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) movdqa xmm3, xmm2 pmullw xmm5, SIMD_SSE2_word_scale_7_9_11_13 pmullw xmm7, SIMD_SSE2_word_scale_7_5_3_1 pmullw xmm2, SIMD_SSE2_word_scale_7_5_3_1 pmullw xmm3, SIMD_SSE2_word_scale_7_9_11_13 paddw xmm5, xmm2 paddw xmm7, xmm3 paddw xmm5, SIMD_SSE2_word_7 paddw xmm7, SIMD_SSE2_word_7 pmulhw xmm5, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 pmulhw xmm7, SIMD_SSE2_word_div_by_14 // * ( ( 1 << 16 ) / 14 + 1 ) ) >> 16 pshufd xmm1, xmm5, R_SHUFFLE_D( 3, 3, 3, 3 ) pshufd xmm2, xmm5, R_SHUFFLE_D( 2, 2, 2, 2 ) pshufd xmm3, xmm5, R_SHUFFLE_D( 1, 1, 1, 1 ) packuswb xmm1, xmm1 // ab1 packuswb xmm2, xmm2 // ab2 packuswb xmm3, xmm3 // ab3 packuswb xmm0, xmm6 // alpha block pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 ) pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 ) pshufd xmm7, xmm7, R_SHUFFLE_D( 3, 3, 3, 3 ) packuswb xmm4, xmm4 // ab4 packuswb xmm5, xmm5 // ab5 packuswb xmm6, xmm6 // ab6 packuswb xmm7, xmm7 // ab7 pmaxub xmm1, xmm0 pmaxub xmm2, xmm0 pmaxub xmm3, xmm0 pcmpeqb xmm1, xmm0 pcmpeqb xmm2, xmm0 pcmpeqb xmm3, xmm0 pmaxub xmm4, xmm0 pmaxub xmm5, xmm0 pmaxub xmm6, xmm0 pmaxub xmm7, xmm0 pcmpeqb xmm4, xmm0 pcmpeqb xmm5, xmm0 pcmpeqb xmm6, xmm0 pcmpeqb xmm7, xmm0 movdqa xmm0, SIMD_SSE2_byte_8 paddsb xmm0, xmm1 paddsb xmm2, xmm3 paddsb xmm4, xmm5 paddsb xmm6, xmm7 paddsb xmm0, xmm2 paddsb xmm4, xmm6 paddsb xmm0, xmm4 pand xmm0, SIMD_SSE2_byte_7 movdqa xmm1, SIMD_SSE2_byte_2 pcmpgtb xmm1, xmm0 pand xmm1, SIMD_SSE2_byte_1 pxor xmm0, xmm1 movdqa xmm1, xmm0 movdqa xmm2, xmm0 movdqa xmm3, xmm0 movdqa xmm4, xmm0 movdqa xmm5, xmm0 movdqa xmm6, xmm0 movdqa xmm7, xmm0 psrlq xmm1, 8- 3 psrlq xmm2, 16- 6 psrlq xmm3, 24- 9 psrlq xmm4, 32-12 psrlq xmm5, 40-15 psrlq xmm6, 48-18 psrlq xmm7, 56-21 pand xmm0, SIMD_SSE2_dword_alpha_bit_mask0 pand xmm1, SIMD_SSE2_dword_alpha_bit_mask1 pand xmm2, SIMD_SSE2_dword_alpha_bit_mask2 pand xmm3, SIMD_SSE2_dword_alpha_bit_mask3 pand xmm4, SIMD_SSE2_dword_alpha_bit_mask4 pand xmm5, SIMD_SSE2_dword_alpha_bit_mask5 pand xmm6, SIMD_SSE2_dword_alpha_bit_mask6 pand xmm7, SIMD_SSE2_dword_alpha_bit_mask7 por xmm0, xmm1 por xmm2, xmm3 por xmm4, xmm5 por xmm6, xmm7 por xmm0, xmm2 por xmm4, xmm6 por xmm0, xmm4 mov esi, outPtr movd [esi+0], xmm0 pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) movd [esi+3], xmm1 } outData += 6; #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&block[ 0])); __m128i block1 = *((__m128i *)(&block[16])); __m128i block2 = *((__m128i *)(&block[32])); __m128i block3 = *((__m128i *)(&block[48])); __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; temp7 = _mm_cvtsi32_si128( channelBitOffset ); temp0 = _mm_srl_epi32( block0, temp7 ); temp5 = _mm_srl_epi32( block1, temp7 ); temp6 = _mm_srl_epi32( block2, temp7 ); temp4 = _mm_srl_epi32( block3, temp7 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask ); temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask 
); temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask ); temp0 = _mm_packus_epi16( temp0, temp5 ); temp6 = _mm_packus_epi16( temp6, temp4 ); //--------------------- // ab0 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab3 = ( 9 * maxAlpha + 5 * minAlpha + ALPHA_RANGE ) / 14 // ab2 = ( 11 * maxAlpha + 3 * minAlpha + ALPHA_RANGE ) / 14 // ab1 = ( 13 * maxAlpha + 1 * minAlpha + ALPHA_RANGE ) / 14 // ab4 = ( 7 * maxAlpha + 7 * minAlpha + ALPHA_RANGE ) / 14 // ab5 = ( 5 * maxAlpha + 9 * minAlpha + ALPHA_RANGE ) / 14 // ab6 = ( 3 * maxAlpha + 11 * minAlpha + ALPHA_RANGE ) / 14 // ab7 = ( 1 * maxAlpha + 13 * minAlpha + ALPHA_RANGE ) / 14 temp5 = _mm_cvtsi32_si128( maxAlpha_ ); temp5 = _mm_shufflelo_epi16( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp5 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp2 = _mm_cvtsi32_si128( minAlpha_ ); temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp7 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 ); temp5 = _mm_mullo_epi16( temp5, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 ); temp3 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_9_11_13 ); temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_7_5_3_1 ); temp5 = _mm_add_epi16( temp5, temp2 ); temp7 = _mm_add_epi16( temp7, temp3 ); temp5 = _mm_add_epi16( temp5, (const __m128i &)SIMD_SSE2_word_7 ); temp7 = _mm_add_epi16( temp7, (const __m128i &)SIMD_SSE2_word_7 ); temp5 = _mm_mulhi_epi16( temp5, (const __m128i &)SIMD_SSE2_word_div_by_14 ); temp7 = _mm_mulhi_epi16( temp7, (const __m128i &)SIMD_SSE2_word_div_by_14 ); temp1 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 3, 3, 3, 3 ) ); temp2 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 2, 2, 2, 2 ) ); temp3 = _mm_shuffle_epi32( temp5, R_SHUFFLE_D( 1, 1, 1, 1 ) ); temp1 = _mm_packus_epi16( temp1, temp1 ); temp2 = _mm_packus_epi16( temp2, temp2 ); temp3 = _mm_packus_epi16( temp3, temp3 ); temp0 = _mm_packus_epi16( temp0, temp6 ); temp4 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp5 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 1, 1, 1, 1 ) ); temp6 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 2, 2, 2, 2 ) ); temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 3, 3, 3, 3 ) ); temp4 = _mm_packus_epi16( temp4, temp4 ); temp5 = _mm_packus_epi16( temp5, temp5 ); temp6 = _mm_packus_epi16( temp6, temp6 ); temp7 = _mm_packus_epi16( temp7, temp7 ); temp1 = _mm_max_epu8( temp1, temp0 ); temp2 = _mm_max_epu8( temp2, temp0 ); temp3 = _mm_max_epu8( temp3, temp0 ); temp1 = _mm_cmpeq_epi8( temp1, temp0 ); temp2 = _mm_cmpeq_epi8( temp2, temp0 ); temp3 = _mm_cmpeq_epi8( temp3, temp0 ); temp4 = _mm_max_epu8( temp4, temp0 ); temp5 = _mm_max_epu8( temp5, temp0 ); temp6 = _mm_max_epu8( temp6, temp0 ); temp7 = _mm_max_epu8( temp7, temp0 ); temp4 = _mm_cmpeq_epi8( temp4, temp0 ); temp5 = _mm_cmpeq_epi8( temp5, temp0 ); temp6 = _mm_cmpeq_epi8( temp6, temp0 ); temp7 = _mm_cmpeq_epi8( temp7, temp0 ); temp0 = _mm_adds_epi8( (const __m128i &)SIMD_SSE2_byte_8, temp1 ); temp2 = _mm_adds_epi8( temp2, temp3 ); temp4 = _mm_adds_epi8( temp4, temp5 ); temp6 = _mm_adds_epi8( temp6, temp7 ); temp0 = _mm_adds_epi8( temp0, temp2 ); temp4 = _mm_adds_epi8( temp4, temp6 ); temp0 = _mm_adds_epi8( temp0, temp4 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_7 ); temp1 = _mm_cmpgt_epi8( (const __m128i &)SIMD_SSE2_byte_2, temp0 ); temp1 = 
_mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_1 );
	temp0 = _mm_xor_si128( temp0, temp1 );

	temp1 = _mm_srli_epi64( temp0, 8 - 3 );
	temp2 = _mm_srli_epi64( temp0, 16 - 6 );
	temp3 = _mm_srli_epi64( temp0, 24 - 9 );
	temp4 = _mm_srli_epi64( temp0, 32 - 12 );
	temp5 = _mm_srli_epi64( temp0, 40 - 15 );
	temp6 = _mm_srli_epi64( temp0, 48 - 18 );
	temp7 = _mm_srli_epi64( temp0, 56 - 21 );

	temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask0 );
	temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask1 );
	temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask2 );
	temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask3 );
	temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask4 );
	temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask5 );
	temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask6 );
	temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_alpha_bit_mask7 );

	temp0 = _mm_or_si128( temp0, temp1 );
	temp2 = _mm_or_si128( temp2, temp3 );
	temp4 = _mm_or_si128( temp4, temp5 );
	temp6 = _mm_or_si128( temp6, temp7 );
	temp0 = _mm_or_si128( temp0, temp2 );
	temp4 = _mm_or_si128( temp4, temp6 );
	temp0 = _mm_or_si128( temp0, temp4 );

	int out = _mm_cvtsi128_si32( temp0 );
	EmitUInt( out );
	outData--;

	temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) );
	out = _mm_cvtsi128_si32( temp1 );
	EmitUInt( out );
	outData--;
#else
	assert( false );
#endif
}

/*
========================
idDxtEncoder::CompressImageDXT1Fast_SSE2

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT1Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );
			GetMinMaxBBox_SSE2( block, minColor, maxColor );
			InsetColorsBBox_SSE2( minColor, maxColor );
			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );
			EmitColorIndices_SSE2( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

#ifdef TEST_COMPRESSION
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
	CompressImageDXT1Fast_Generic( inBuf, testOutBuf, width, height );
	for ( int j = 0; j < height/4; j++ ) {
		for ( int i = 0; i < width/4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
			for ( int k = 0; k < 8; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}

/*
========================
idDxtEncoder::CompressImageDXT1AlphaFast_SSE2

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT1AlphaFast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );
			GetMinMaxBBox_SSE2( block, minColor, maxColor );
			byte minAlpha = minColor[3];
			InsetColorsBBox_SSE2( minColor, maxColor );
			if ( minAlpha >= 128 ) {
				EmitUShort( ColorTo565( maxColor ) );
				EmitUShort( ColorTo565( minColor ) );
				EmitColorIndices_SSE2( block, minColor, maxColor );
			} else {
				EmitUShort( ColorTo565( minColor ) );
				EmitUShort( ColorTo565( maxColor ) );
				EmitColorAlphaIndices_SSE2( block, minColor, maxColor );
			}
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

#ifdef TEST_COMPRESSION
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height / 2 );
	CompressImageDXT1AlphaFast_Generic( inBuf, testOutBuf, width, height );
	for ( int j = 0; j < height/4; j++ ) {
		for ( int i = 0; i < width/4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 8 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 8;
			for ( int k = 0; k < 8; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}

/*
========================
idDxtEncoder::CompressImageDXT5Fast_SSE2

params:	inBuf		- image to compress
paramO:	outBuf		- result of compression
params:	width		- width of image
params:	height		- height of image
========================
*/
void idDxtEncoder::CompressImageDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) {
	ALIGN16( byte block[64] );
	ALIGN16( byte minColor[4] );
	ALIGN16( byte maxColor[4] );

	assert( width >= 4 && ( width & 3 ) == 0 );
	assert( height >= 4 && ( height & 3 ) == 0 );

	this->width = width;
	this->height = height;
	this->outData = outBuf;

	for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) {
		for ( int i = 0; i < width; i += 4 ) {
			ExtractBlock_SSE2( inBuf + i * 4, width, block );
			GetMinMaxBBox_SSE2( block, minColor, maxColor );
			InsetColorsBBox_SSE2( minColor, maxColor );
			EmitByte( maxColor[3] );
			EmitByte( minColor[3] );
			EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] );
			EmitUShort( ColorTo565( maxColor ) );
			EmitUShort( ColorTo565( minColor ) );
			EmitColorIndices_SSE2( block, minColor, maxColor );
		}
		outData += dstPadding;
		inBuf += srcPadding;
	}

#ifdef TEST_COMPRESSION
	int tmpDstPadding = dstPadding;
	dstPadding = 0;
	byte * testOutBuf = (byte *) _alloca16( width * height );
	CompressImageDXT5Fast_Generic( inBuf, testOutBuf, width, height );
	for ( int j = 0; j < height / 4; j++ ) {
		for ( int i = 0; i < width / 4; i++ ) {
			byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding;
			byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16;
			for ( int k = 0; k < 16; k++ ) {
				assert( ptr1[k] == ptr2[k] );
			}
		}
	}
	dstPadding = tmpDstPadding;
#endif
}

/*
========================
idDxtEncoder::ScaleYCoCg_SSE2
========================
*/
ID_INLINE void idDxtEncoder::ScaleYCoCg_SSE2( byte *colorBlock, byte *minColor, byte *maxColor ) const {
#if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) )
	__asm {
		mov			esi, colorBlock
		mov			edx, minColor
		mov			ecx, maxColor
		movd		xmm0, dword ptr [edx]
		movd		xmm1, dword ptr [ecx]
		punpcklbw	xmm0, SIMD_SSE2_byte_0
		punpcklbw	xmm1, SIMD_SSE2_byte_0
		movdqa		xmm6, SIMD_SSE2_word_center_128
		movdqa		xmm7, SIMD_SSE2_word_center_128
		psubw		xmm6, xmm0
		psubw		xmm7, xmm1
		psubw		xmm0, SIMD_SSE2_word_center_128
		psubw		xmm1, SIMD_SSE2_word_center_128
		pmaxsw		xmm6, xmm0
		pmaxsw		xmm7, xmm1
		pmaxsw		xmm6, xmm7
		pshuflw		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 1, 0 )
		pmaxsw
xmm6, xmm7 pshufd xmm6, xmm6, R_SHUFFLE_D( 0, 0, 0, 0 ) movdqa xmm7, xmm6 pcmpgtw xmm6, SIMD_SSE2_word_63 // mask0 pcmpgtw xmm7, SIMD_SSE2_word_31 // mask1 pandn xmm7, SIMD_SSE2_byte_2 por xmm7, SIMD_SSE2_byte_1 pandn xmm6, xmm7 movdqa xmm3, xmm6 movdqa xmm7, xmm6 pxor xmm7, SIMD_SSE2_byte_not por xmm7, SIMD_SSE2_byte_scale_mask0 // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00 paddw xmm6, SIMD_SSE2_byte_1 pand xmm6, SIMD_SSE2_byte_scale_mask1 // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF por xmm6, SIMD_SSE2_byte_scale_mask2 // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 movd xmm4, dword ptr [edx] movd xmm5, dword ptr [ecx] pand xmm4, SIMD_SSE2_byte_scale_mask3 // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF pand xmm5, SIMD_SSE2_byte_scale_mask3 pslld xmm3, 3 pand xmm3, SIMD_SSE2_byte_scale_mask4 // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00 por xmm4, xmm3 por xmm5, xmm3 paddb xmm4, SIMD_SSE2_byte_minus_128_0 paddb xmm5, SIMD_SSE2_byte_minus_128_0 pmullw xmm4, xmm6 pmullw xmm5, xmm6 pand xmm4, xmm7 pand xmm5, xmm7 psubb xmm4, SIMD_SSE2_byte_minus_128_0 psubb xmm5, SIMD_SSE2_byte_minus_128_0 movd dword ptr [edx], xmm4 movd dword ptr [ecx], xmm5 movdqa xmm0, xmmword ptr [esi+ 0*4] movdqa xmm1, xmmword ptr [esi+ 4*4] movdqa xmm2, xmmword ptr [esi+ 8*4] movdqa xmm3, xmmword ptr [esi+12*4] paddb xmm0, SIMD_SSE2_byte_minus_128_0 paddb xmm1, SIMD_SSE2_byte_minus_128_0 paddb xmm2, SIMD_SSE2_byte_minus_128_0 paddb xmm3, SIMD_SSE2_byte_minus_128_0 pmullw xmm0, xmm6 pmullw xmm1, xmm6 pmullw xmm2, xmm6 pmullw xmm3, xmm6 pand xmm0, xmm7 pand xmm1, xmm7 pand xmm2, xmm7 pand xmm3, xmm7 psubb xmm0, SIMD_SSE2_byte_minus_128_0 psubb xmm1, SIMD_SSE2_byte_minus_128_0 psubb xmm2, SIMD_SSE2_byte_minus_128_0 psubb xmm3, SIMD_SSE2_byte_minus_128_0 movdqa xmmword ptr [esi+ 0*4], xmm0 movdqa xmmword ptr [esi+ 4*4], xmm1 movdqa xmmword ptr [esi+ 8*4], xmm2 movdqa xmmword ptr [esi+12*4], xmm3 } #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&colorBlock[ 0])); __m128i block1 = *((__m128i *)(&colorBlock[16])); __m128i block2 = *((__m128i *)(&colorBlock[32])); __m128i block3 = *((__m128i *)(&colorBlock[48])); __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; temp0 = _mm_cvtsi32_si128( *(int *)minColor ); temp1 = _mm_cvtsi32_si128( *(int *)maxColor ); temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 ); // TODO: Algorithm seems to be get the absolute difference temp6 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp0 ); temp7 = _mm_sub_epi16( (const __m128i &)SIMD_SSE2_word_center_128, temp1 ); temp0 = _mm_sub_epi16( temp0, (const __m128i &)SIMD_SSE2_word_center_128 ); temp1 = _mm_sub_epi16( temp1, (const __m128i &)SIMD_SSE2_word_center_128 ); temp6 = _mm_max_epi16( temp6, temp0 ); temp7 = _mm_max_epi16( temp7, temp1 ); temp6 = _mm_max_epi16( temp6, temp7 ); temp7 = _mm_shufflelo_epi16( temp6, R_SHUFFLE_D( 1, 0, 1, 0 ) ); temp6 = _mm_max_epi16( temp6, temp7 ); temp6 = _mm_shuffle_epi32( temp6, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp7 = temp6; temp6 = _mm_cmpgt_epi16( temp6, (const __m128i &)SIMD_SSE2_word_63 ); // mask0 temp7 = _mm_cmpgt_epi16( temp7, (const __m128i &)SIMD_SSE2_word_31 ); // mask1 temp7 = _mm_andnot_si128( temp7, (const __m128i &)SIMD_SSE2_byte_2 ); temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_1 ); temp6 = _mm_andnot_si128( temp6, temp7 ); temp3 = temp6; temp7 = temp6; temp7 = _mm_xor_si128( temp7, (const __m128i &)SIMD_SSE2_byte_not ); 
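	// At this point temp6 holds the CoCg scale selector replicated across the register:
	// 3 if the largest |Co/Cg - 128| of the min/max colors is at most 31, 1 if it is at
	// most 63, and 0 otherwise, so the chroma endpoints can be scaled by 4, 2 or 1 before
	// 5:6:5 quantization. The mask constants below appear to turn this selector into the
	// per-word multiplier and byte mask that apply the scaling to the Co/Cg channels of
	// the endpoints and of the whole block, and to store ( scale - 1 ) << 3 in the
	// otherwise unused blue channel so the scaling can be undone at decode time.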
temp7 = _mm_or_si128( temp7, (const __m128i &)SIMD_SSE2_byte_scale_mask0 ); // 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00 temp6 = _mm_add_epi16( temp6, (const __m128i &)SIMD_SSE2_byte_1 ); temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask1 ); // 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, 0xFF temp6 = _mm_or_si128( temp6, (const __m128i &)SIMD_SSE2_byte_scale_mask2 ); // 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 // TODO: remove this second store temp4 = _mm_cvtsi32_si128( *(int *)minColor ); temp5 = _mm_cvtsi32_si128( *(int *)maxColor ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_scale_mask3 ); // 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0xFF temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_byte_scale_mask3 ); temp3 = _mm_slli_epi32( temp3, 3 ); temp3 = _mm_and_si128( temp3, (const __m128i &)SIMD_SSE2_byte_scale_mask4 ); // 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00 temp4 = _mm_or_si128( temp4, temp3 ); temp5 = _mm_or_si128( temp5, temp3 ); temp4 = _mm_add_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); temp5 = _mm_add_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); temp4 = _mm_mullo_epi16( temp4, temp6 ); temp5 = _mm_mullo_epi16( temp5, temp6 ); temp4 = _mm_and_si128( temp4, temp7 ); temp5 = _mm_and_si128( temp5, temp7 ); temp4 = _mm_sub_epi8( temp4, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); temp5 = _mm_sub_epi8( temp5, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); *(int *)minColor = _mm_cvtsi128_si32( temp4 ); *(int *)maxColor = _mm_cvtsi128_si32( temp5 ); temp0 = _mm_add_epi8( block0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); temp1 = _mm_add_epi8( block1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); temp2 = _mm_add_epi8( block2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); temp3 = _mm_add_epi8( block3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); temp0 = _mm_mullo_epi16( temp0, temp6 ); temp1 = _mm_mullo_epi16( temp1, temp6 ); temp2 = _mm_mullo_epi16( temp2, temp6 ); temp3 = _mm_mullo_epi16( temp3, temp6 ); temp0 = _mm_and_si128( temp0, temp7 ); temp1 = _mm_and_si128( temp1, temp7 ); temp2 = _mm_and_si128( temp2, temp7 ); temp3 = _mm_and_si128( temp3, temp7 ); *((__m128i *)(&colorBlock[ 0])) = _mm_sub_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); *((__m128i *)(&colorBlock[16])) = _mm_sub_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); *((__m128i *)(&colorBlock[32])) = _mm_sub_epi8( temp2, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); *((__m128i *)(&colorBlock[48])) = _mm_sub_epi8( temp3, (const __m128i &)SIMD_SSE2_byte_minus_128_0 ); #else assert( false ); #endif } /* ======================== idDxtEncoder::InsetYCoCgBBox_SSE2 ======================== */ ID_INLINE void idDxtEncoder::InsetYCoCgBBox_SSE2( byte *minColor, byte *maxColor ) const { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) __asm { mov esi, minColor mov edi, maxColor movd xmm0, dword ptr [esi] movd xmm1, dword ptr [edi] punpcklbw xmm0, SIMD_SSE2_byte_0 punpcklbw xmm1, SIMD_SSE2_byte_0 movdqa xmm2, xmm1 psubw xmm2, xmm0 psubw xmm2, SIMD_SSE2_word_insetYCoCgRound pand xmm2, SIMD_SSE2_word_insetYCoCgMask pmullw xmm0, SIMD_SSE2_word_insetYCoCgShiftUp pmullw xmm1, SIMD_SSE2_word_insetYCoCgShiftUp paddw xmm0, xmm2 psubw xmm1, xmm2 pmulhw xmm0, SIMD_SSE2_word_insetYCoCgShiftDown pmulhw xmm1, SIMD_SSE2_word_insetYCoCgShiftDown pmaxsw xmm0, SIMD_SSE2_word_0 pmaxsw xmm1, SIMD_SSE2_word_0 pand xmm0, SIMD_SSE2_word_insetYCoCgQuantMask pand xmm1, SIMD_SSE2_word_insetYCoCgQuantMask movdqa 
xmm2, xmm0 movdqa xmm3, xmm1 pmulhw xmm2, SIMD_SSE2_word_insetYCoCgRep pmulhw xmm3, SIMD_SSE2_word_insetYCoCgRep por xmm0, xmm2 por xmm1, xmm3 packuswb xmm0, xmm0 packuswb xmm1, xmm1 movd dword ptr [esi], xmm0 movd dword ptr [edi], xmm1 } #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; temp0 = _mm_cvtsi32_si128( *(int *)minColor ); temp1 = _mm_cvtsi32_si128( *(int *)maxColor ); temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 ); temp2 = _mm_sub_epi16( temp1, temp0 ); temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgRound ); temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetYCoCgMask ); temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp ); temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftUp ); temp0 = _mm_add_epi16( temp0, temp2 ); temp1 = _mm_sub_epi16( temp1, temp2 ); temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown ); temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgShiftDown ); temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 ); temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask ); temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgQuantMask ); temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep ); temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetYCoCgRep ); temp0 = _mm_or_si128( temp0, temp2 ); temp1 = _mm_or_si128( temp1, temp3 ); temp0 = _mm_packus_epi16( temp0, temp0 ); temp1 = _mm_packus_epi16( temp1, temp1 ); *(int *)minColor = _mm_cvtsi128_si32( temp0 ); *(int *)maxColor = _mm_cvtsi128_si32( temp1 ); #else assert( false ); #endif } /* ======================== idDxtEncoder::SelectYCoCgDiagonal_SSE2 params: colorBlock - 16 pixel block to find color indexes for paramO: minColor - min color found paramO: maxColor - max color found return: diagonal to use ======================== */ ID_INLINE void idDxtEncoder::SelectYCoCgDiagonal_SSE2( const byte *colorBlock, byte *minColor, byte *maxColor ) const { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) __asm { mov esi, colorBlock mov edx, minColor mov ecx, maxColor movdqa xmm0, xmmword ptr [esi+ 0] movdqa xmm1, xmmword ptr [esi+16] movdqa xmm2, xmmword ptr [esi+32] movdqa xmm3, xmmword ptr [esi+48] pand xmm0, SIMD_SSE2_dword_word_mask pand xmm1, SIMD_SSE2_dword_word_mask pand xmm2, SIMD_SSE2_dword_word_mask pand xmm3, SIMD_SSE2_dword_word_mask pslldq xmm1, 2 pslldq xmm3, 2 por xmm0, xmm1 por xmm2, xmm3 movd xmm1, dword ptr [edx] // minColor movd xmm3, dword ptr [ecx] // maxColor movdqa xmm6, xmm1 movdqa xmm7, xmm3 pavgb xmm1, xmm3 pshuflw xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 ) movdqa xmm3, xmm1 pmaxub xmm1, xmm0 pmaxub xmm3, xmm2 pcmpeqb xmm1, xmm0 pcmpeqb xmm3, xmm2 movdqa xmm0, xmm1 movdqa xmm2, xmm3 psrldq xmm0, 1 psrldq xmm2, 1 pxor xmm0, xmm1 pxor xmm2, xmm3 pand xmm0, SIMD_SSE2_word_1 pand xmm2, SIMD_SSE2_word_1 paddw xmm0, xmm2 psadbw xmm0, SIMD_SSE2_byte_0 pshufd xmm1, xmm0, R_SHUFFLE_D( 2, 3, 0, 1 ) #ifdef NVIDIA_7X_HARDWARE_BUG_FIX paddw xmm1, xmm0 // side pcmpgtw xmm1, SIMD_SSE2_word_8 // mask = -( side > 8 ) pand xmm1, SIMD_SSE2_byte_diagonalMask movdqa xmm0, xmm6 pcmpeqb xmm0, xmm7 // mask 
&= -( minColor[0] != maxColor[0] ) pslldq xmm0, 1 pandn xmm0, xmm1 #else paddw xmm0, xmm1 // side pcmpgtw xmm0, SIMD_SSE2_word_8 // mask = -( side > 8 ) pand xmm0, SIMD_SSE2_byte_diagonalMask #endif pxor xmm6, xmm7 pand xmm0, xmm6 pxor xmm7, xmm0 pxor xmm6, xmm7 movd dword ptr [edx], xmm6 movd dword ptr [ecx], xmm7 } #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&colorBlock[ 0])); __m128i block1 = *((__m128i *)(&colorBlock[16])); __m128i block2 = *((__m128i *)(&colorBlock[32])); __m128i block3 = *((__m128i *)(&colorBlock[48])); __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; temp0 = _mm_and_si128( block0, (const __m128i &)SIMD_SSE2_dword_word_mask ); temp1 = _mm_and_si128( block1, (const __m128i &)SIMD_SSE2_dword_word_mask ); temp2 = _mm_and_si128( block2, (const __m128i &)SIMD_SSE2_dword_word_mask ); temp3 = _mm_and_si128( block3, (const __m128i &)SIMD_SSE2_dword_word_mask ); temp1 = _mm_slli_si128( temp1, 2 ); temp3 = _mm_slli_si128( temp3, 2 ); temp0 = _mm_or_si128( temp0, temp1 ); temp2 = _mm_or_si128( temp2, temp3 ); temp6 = _mm_cvtsi32_si128( *(int *)minColor ); temp7 = _mm_cvtsi32_si128( *(int *)maxColor ); temp1 = _mm_avg_epu8( temp6, temp7 ); temp1 = _mm_shufflelo_epi16( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp3 = _mm_max_epu8( temp1, temp2 ); temp1 = _mm_max_epu8( temp1, temp0 ); temp1 = _mm_cmpeq_epi8( temp1, temp0 ); temp3 = _mm_cmpeq_epi8( temp3, temp2 ); temp0 = _mm_srli_si128( temp1, 1 ); temp2 = _mm_srli_si128( temp3, 1 ); temp0 = _mm_xor_si128( temp0, temp1 ); temp2 = _mm_xor_si128( temp2, temp3 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_word_1 ); temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_1 ); temp0 = _mm_add_epi16( temp0, temp2 ); temp0 = _mm_sad_epu8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); temp1 = _mm_shuffle_epi32( temp0, R_SHUFFLE_D( 2, 3, 0, 1 ) ); #ifdef NVIDIA_7X_HARDWARE_BUG_FIX temp1 = _mm_add_epi16( temp1, temp0 ); temp1 = _mm_cmpgt_epi16( temp1, (const __m128i &)SIMD_SSE2_word_8 ); temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_byte_diagonalMask ); temp0 = _mm_cmpeq_epi8( temp6, temp7 ); temp0 = _mm_slli_si128( temp0, 1 ); temp0 = _mm_andnot_si128( temp0, temp1 ); #else temp0 = _mm_add_epi16( temp0, temp1 ); temp0 = _mm_cmpgt_epi16( temp0, (const __m128i &)SIMD_SSE2_word_8 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_diagonalMask ); #endif temp6 = _mm_xor_si128( temp6, temp7 ); temp0 = _mm_and_si128( temp0, temp6 ); temp7 = _mm_xor_si128( temp7, temp0 ); temp6 = _mm_xor_si128( temp6, temp7 ); *(int *)minColor = _mm_cvtsi128_si32( temp6 ); *(int *)maxColor = _mm_cvtsi128_si32( temp7 ); #else assert( false ); #endif } /* ======================== idDxtEncoder::CompressYCoCgDXT5Fast_SSE2 params: inBuf - image to compress paramO: outBuf - result of compression params: width - width of image params: height - height of image ======================== */ void idDxtEncoder::CompressYCoCgDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) { ALIGN16( byte block[64] ); ALIGN16( byte minColor[4] ); ALIGN16( byte maxColor[4] ); //assert( HasConstantValuePer4x4Block( inBuf, width, height, 2 ) ); assert( width >= 4 && ( width & 3 ) == 0 ); assert( height >= 4 && ( height & 3 ) == 0 ); this->width = width; this->height = height; this->outData = outBuf; for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) { for ( int i = 0; i < width; i += 4 ) { ExtractBlock_SSE2( inBuf + i * 4, 
width, block ); GetMinMaxBBox_SSE2( block, minColor, maxColor ); ScaleYCoCg_SSE2( block, minColor, maxColor ); InsetYCoCgBBox_SSE2( minColor, maxColor ); SelectYCoCgDiagonal_SSE2( block, minColor, maxColor ); EmitByte( maxColor[3] ); EmitByte( minColor[3] ); EmitAlphaIndices_SSE2( block, minColor[3], maxColor[3] ); EmitUShort( ColorTo565( maxColor ) ); EmitUShort( ColorTo565( minColor ) ); EmitCoCgIndices_SSE2( block, minColor, maxColor ); } outData += dstPadding; inBuf += srcPadding; } #ifdef TEST_COMPRESSION int tmpDstPadding = dstPadding; dstPadding = 0; byte * testOutBuf = (byte *) _alloca16( width * height ); CompressYCoCgDXT5Fast_Generic( inBuf, testOutBuf, width, height ); for ( int j = 0; j < height / 4; j++ ) { for ( int i = 0; i < width / 4; i++ ) { byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding; byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16; for ( int k = 0; k < 16; k++ ) { assert( ptr1[k] == ptr2[k] ); } } } dstPadding = tmpDstPadding; #endif } /* ======================== idDxtEncoder::EmitGreenIndices_SSE2 params: block - 16-normal block for which to find normal Y indices paramO: minGreen - Minimal normal Y found paramO: maxGreen - Maximal normal Y found ======================== */ void idDxtEncoder::EmitGreenIndices_SSE2( const byte *block, const int channelBitOffset, const int minGreen, const int maxGreen ) { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) assert( maxGreen >= minGreen ); byte *outPtr = outData; __asm { movd xmm7, channelBitOffset mov esi, block movdqa xmm0, xmmword ptr [esi+ 0] movdqa xmm5, xmmword ptr [esi+16] movdqa xmm6, xmmword ptr [esi+32] movdqa xmm4, xmmword ptr [esi+48] psrld xmm0, xmm7 psrld xmm5, xmm7 psrld xmm6, xmm7 psrld xmm4, xmm7 pand xmm0, SIMD_SSE2_dword_byte_mask pand xmm5, SIMD_SSE2_dword_byte_mask pand xmm6, SIMD_SSE2_dword_byte_mask pand xmm4, SIMD_SSE2_dword_byte_mask packuswb xmm0, xmm5 packuswb xmm6, xmm4 //--------------------- movd xmm2, maxGreen pshuflw xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) movd xmm3, minGreen pshuflw xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 ) pmullw xmm2, SIMD_SSE2_word_scale_5_3_1 pmullw xmm3, SIMD_SSE2_word_scale_1_3_5 paddw xmm2, SIMD_SSE2_word_3 paddw xmm3, xmm2 pmulhw xmm3, SIMD_SSE2_word_div_by_6 pshuflw xmm1, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 ) pshuflw xmm2, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 ) pshuflw xmm3, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 ) pshufd xmm1, xmm1, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm2, xmm2, R_SHUFFLE_D( 0, 0, 0, 0 ) pshufd xmm3, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 ) packuswb xmm1, xmm1 packuswb xmm2, xmm2 packuswb xmm3, xmm3 packuswb xmm0, xmm6 pmaxub xmm1, xmm0 pmaxub xmm2, xmm0 pmaxub xmm3, xmm0 pcmpeqb xmm1, xmm0 pcmpeqb xmm2, xmm0 pcmpeqb xmm3, xmm0 movdqa xmm0, SIMD_SSE2_byte_4 paddsb xmm0, xmm1 paddsb xmm2, xmm3 paddsb xmm0, xmm2 pand xmm0, SIMD_SSE2_byte_3 movdqa xmm4, SIMD_SSE2_byte_2 pcmpgtb xmm4, xmm0 pand xmm4, SIMD_SSE2_byte_1 pxor xmm0, xmm4 movdqa xmm4, xmm0 movdqa xmm5, xmm0 movdqa xmm6, xmm0 movdqa xmm7, xmm0 psrlq xmm4, 8- 2 psrlq xmm5, 16- 4 psrlq xmm6, 24- 6 psrlq xmm7, 32- 8 pand xmm4, SIMD_SSE2_dword_color_bit_mask1 pand xmm5, SIMD_SSE2_dword_color_bit_mask2 pand xmm6, SIMD_SSE2_dword_color_bit_mask3 pand xmm7, SIMD_SSE2_dword_color_bit_mask4 por xmm5, xmm4 por xmm7, xmm6 por xmm7, xmm5 movdqa xmm4, xmm0 movdqa xmm5, xmm0 movdqa xmm6, xmm0 psrlq xmm4, 40-10 psrlq xmm5, 48-12 psrlq xmm6, 56-14 pand xmm0, SIMD_SSE2_dword_color_bit_mask0 pand xmm4, SIMD_SSE2_dword_color_bit_mask5 pand xmm5, SIMD_SSE2_dword_color_bit_mask6 pand xmm6, 
SIMD_SSE2_dword_color_bit_mask7 por xmm4, xmm5 por xmm0, xmm6 por xmm7, xmm4 por xmm7, xmm0 mov esi, outPtr pshufd xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 ) pshuflw xmm7, xmm7, R_SHUFFLE_D( 0, 2, 1, 3 ) movd [esi], xmm7 } outData += 4; #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i block0 = *((__m128i *)(&block[ 0])); __m128i block1 = *((__m128i *)(&block[16])); __m128i block2 = *((__m128i *)(&block[32])); __m128i block3 = *((__m128i *)(&block[48])); __m128c temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; temp7 = _mm_cvtsi32_si128( channelBitOffset ); temp0 = _mm_srl_epi32( block0, temp7 ); temp5 = _mm_srl_epi32( block1, temp7 ); temp6 = _mm_srl_epi32( block2, temp7 ); temp4 = _mm_srl_epi32( block3, temp7 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_byte_mask ); temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_byte_mask ); temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_byte_mask ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_byte_mask ); temp0 = _mm_packus_epi16( temp0, temp5 ); temp6 = _mm_packus_epi16( temp6, temp4 ); //--------------------- temp2 = _mm_cvtsi32_si128( maxGreen ); temp2 = _mm_shufflelo_epi16( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp3 = _mm_cvtsi32_si128( minGreen ); temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp2 = _mm_mullo_epi16( temp2, (const __m128i &)SIMD_SSE2_word_scale_5_3_1 ); temp3 = _mm_mullo_epi16( temp3, (const __m128i &)SIMD_SSE2_word_scale_1_3_5 ); temp2 = _mm_add_epi16( temp2, (const __m128i &)SIMD_SSE2_word_3 ); temp3 = _mm_add_epi16( temp3, temp2 ); temp3 = _mm_mulhi_epi16( temp3, (const __m128i &)SIMD_SSE2_word_div_by_6 ); temp1 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp2 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 1, 1, 1, 1 ) ); temp3 = _mm_shufflelo_epi16( temp3, R_SHUFFLE_D( 2, 2, 2, 2 ) ); temp1 = _mm_shuffle_epi32( temp1, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp2 = _mm_shuffle_epi32( temp2, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp3 = _mm_shuffle_epi32( temp3, R_SHUFFLE_D( 0, 0, 0, 0 ) ); temp1 = _mm_packus_epi16( temp1, temp1 ); temp2 = _mm_packus_epi16( temp2, temp2 ); temp3 = _mm_packus_epi16( temp3, temp3 ); temp0 = _mm_packus_epi16( temp0, temp6 ); temp1 = _mm_max_epu8( temp1, temp0 ); temp2 = _mm_max_epu8( temp2, temp0 ); temp3 = _mm_max_epu8( temp3, temp0 ); temp1 = _mm_cmpeq_epi8( temp1, temp0 ); temp2 = _mm_cmpeq_epi8( temp2, temp0 ); temp3 = _mm_cmpeq_epi8( temp3, temp0 ); temp0 = (const __m128i &)SIMD_SSE2_byte_4; temp0 = _mm_adds_epi8( temp0, temp1 ); temp2 = _mm_adds_epi8( temp2, temp3 ); temp0 = _mm_adds_epi8( temp0, temp2 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_byte_3 ); temp4 = (const __m128i &)SIMD_SSE2_byte_2; temp4 = _mm_cmpgt_epi8( temp4, temp0 ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_byte_1 ); temp0 = _mm_xor_si128( temp0, temp4 ); temp4 = _mm_srli_epi64( temp0, 8 - 2 ); temp5 = _mm_srli_epi64( temp0, 16 - 4 ); temp6 = _mm_srli_epi64( temp0, 24 - 6 ); temp7 = _mm_srli_epi64( temp0, 32 - 8 ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask1 ); temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask2 ); temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask3 ); temp7 = _mm_and_si128( temp7, (const __m128i &)SIMD_SSE2_dword_color_bit_mask4 ); temp5 = _mm_or_si128( temp5, temp4 ); temp7 = _mm_or_si128( temp7, temp6 ); temp7 = _mm_or_si128( temp7, temp5 ); temp4 = _mm_srli_epi64( temp0, 40 - 10 ); temp5 = 
_mm_srli_epi64( temp0, 48 - 12 ); temp6 = _mm_srli_epi64( temp0, 56 - 14 ); temp0 = _mm_and_si128( temp0, (const __m128i &)SIMD_SSE2_dword_color_bit_mask0 ); temp4 = _mm_and_si128( temp4, (const __m128i &)SIMD_SSE2_dword_color_bit_mask5 ); temp5 = _mm_and_si128( temp5, (const __m128i &)SIMD_SSE2_dword_color_bit_mask6 ); temp6 = _mm_and_si128( temp6, (const __m128i &)SIMD_SSE2_dword_color_bit_mask7 ); temp4 = _mm_or_si128( temp4, temp5 ); temp0 = _mm_or_si128( temp0, temp6 ); temp7 = _mm_or_si128( temp7, temp4 ); temp7 = _mm_or_si128( temp7, temp0 ); temp7 = _mm_shuffle_epi32( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) ); temp7 = _mm_shufflelo_epi16( temp7, R_SHUFFLE_D( 0, 2, 1, 3 ) ); int result = _mm_cvtsi128_si32( temp7 ); EmitUInt( result ); #else assert( false ); #endif } /* ======================== idDxtEncoder::InsetNormalsBBoxDXT5_SSE2 ======================== */ void idDxtEncoder::InsetNormalsBBoxDXT5_SSE2( byte *minNormal, byte *maxNormal ) const { #if ( defined( ID_WIN_X86_ASM ) || defined( ID_MAC_X86_ASM ) ) __asm { mov esi, minNormal mov edi, maxNormal movd xmm0, dword ptr [esi] // xmm0 = minNormal movd xmm1, dword ptr [edi] // xmm1 = maxNormal punpcklbw xmm0, SIMD_SSE2_byte_0 punpcklbw xmm1, SIMD_SSE2_byte_0 movdqa xmm2, xmm1 psubw xmm2, xmm0 psubw xmm2, SIMD_SSE2_word_insetNormalDXT5Round pand xmm2, SIMD_SSE2_word_insetNormalDXT5Mask // xmm2 = inset (1 & 3) pmullw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftUp pmullw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftUp paddw xmm0, xmm2 psubw xmm1, xmm2 pmulhw xmm0, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm0 = mini pmulhw xmm1, SIMD_SSE2_word_insetNormalDXT5ShiftDown // xmm1 = maxi // mini and maxi must be >= 0 and <= 255 pmaxsw xmm0, SIMD_SSE2_word_0 pmaxsw xmm1, SIMD_SSE2_word_0 pminsw xmm0, SIMD_SSE2_word_255 pminsw xmm1, SIMD_SSE2_word_255 movdqa xmm2, xmm0 movdqa xmm3, xmm1 pand xmm0, SIMD_SSE2_word_insetNormalDXT5QuantMask pand xmm1, SIMD_SSE2_word_insetNormalDXT5QuantMask pmulhw xmm2, SIMD_SSE2_word_insetNormalDXT5Rep pmulhw xmm3, SIMD_SSE2_word_insetNormalDXT5Rep por xmm0, xmm2 por xmm1, xmm3 packuswb xmm0, xmm0 packuswb xmm1, xmm1 movd dword ptr [esi], xmm0 movd dword ptr [edi], xmm1 } #elif defined ( ID_WIN_X86_SSE2_INTRIN ) __m128i temp0, temp1, temp2, temp3; temp0 = _mm_cvtsi32_si128( *(int *)minNormal ); temp1 = _mm_cvtsi32_si128( *(int *)maxNormal ); temp0 = _mm_unpacklo_epi8( temp0, (const __m128i &)SIMD_SSE2_byte_0 ); temp1 = _mm_unpacklo_epi8( temp1, (const __m128i &)SIMD_SSE2_byte_0 ); temp2 = _mm_sub_epi16( temp1, temp0 ); temp2 = _mm_sub_epi16( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Round ); temp2 = _mm_and_si128( temp2, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Mask ); // xmm2 = inset (1 & 3) temp0 = _mm_mullo_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp ); temp1 = _mm_mullo_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftUp ); temp0 = _mm_add_epi16( temp0, temp2 ); temp1 = _mm_sub_epi16( temp1, temp2 ); temp0 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown ); // xmm0 = mini temp1 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5ShiftDown ); // xmm1 = maxi // mini and maxi must be >= 0 and <= 255 temp0 = _mm_max_epi16( temp0, (const __m128i &)SIMD_SSE2_word_0 ); temp1 = _mm_max_epi16( temp1, (const __m128i &)SIMD_SSE2_word_0 ); temp0 = _mm_min_epi16( temp0, (const __m128i &)SIMD_SSE2_word_255 ); temp1 = _mm_min_epi16( temp1, (const __m128i &)SIMD_SSE2_word_255 ); temp0 = _mm_and_si128( temp0, (const 
__m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask ); temp1 = _mm_and_si128( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5QuantMask ); temp2 = _mm_mulhi_epi16( temp0, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep ); temp3 = _mm_mulhi_epi16( temp1, (const __m128i &)SIMD_SSE2_word_insetNormalDXT5Rep ); temp0 = _mm_or_si128( temp0, temp2 ); temp1 = _mm_or_si128( temp1, temp3 ); temp0 = _mm_packus_epi16( temp0, temp0 ); temp1 = _mm_packus_epi16( temp1, temp1 ); *(int *)minNormal = _mm_cvtsi128_si32( temp0 ); *(int *)maxNormal = _mm_cvtsi128_si32( temp1 ); #else assert( false ); #endif } /* ======================== idDxtEncoder::CompressNormalMapDXT5Fast_SSE2 params: inBuf - image to compress in _y_x component order paramO: outBuf - result of compression params: width - width of image params: height - height of image ======================== */ void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte *inBuf, byte *outBuf, int width, int height ) { ALIGN16( byte block[64] ); ALIGN16( byte normal1[4] ); ALIGN16( byte normal2[4] ); assert( width >= 4 && ( width & 3 ) == 0 ); assert( height >= 4 && ( height & 3 ) == 0 ); this->width = width; this->height = height; this->outData = outBuf; for ( int j = 0; j < height; j += 4, inBuf += width * 4*4 ) { for ( int i = 0; i < width; i += 4 ) { ExtractBlock_SSE2( inBuf + i * 4, width, block ); GetMinMaxBBox_SSE2( block, normal1, normal2 ); InsetNormalsBBoxDXT5_SSE2( normal1, normal2 ); // Write out Nx into alpha channel. EmitByte( normal2[3] ); EmitByte( normal1[3] ); EmitAlphaIndices_SSE2( block, 3*8, normal1[3], normal2[3] ); // Write out Ny into green channel. EmitUShort( ColorTo565( block[0], normal2[1], block[2] ) ); EmitUShort( ColorTo565( block[0], normal1[1], block[2] ) ); EmitGreenIndices_SSE2( block, 1*8, normal1[1], normal2[1] ); } outData += dstPadding; inBuf += srcPadding; } #ifdef TEST_COMPRESSION int tmpDstPadding = dstPadding; dstPadding = 0; byte * testOutBuf = (byte *) _alloca16( width * height ); CompressNormalMapDXT5Fast_Generic( inBuf, testOutBuf, width, height ); for ( int j = 0; j < height / 4; j++ ) { for ( int i = 0; i < width / 4; i++ ) { byte * ptr1 = outBuf + ( j * width/4 + i ) * 16 + j * tmpDstPadding; byte * ptr2 = testOutBuf + ( j * width/4 + i ) * 16; for ( int k = 0; k < 16; k++ ) { assert( ptr1[k] == ptr2[k] ); } } } dstPadding = tmpDstPadding; #endif } #endif
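
/*
========================
Illustrative scalar sketch (added for exposition; never compiled)

A minimal scalar rendition of the alpha index selection that EmitAlphaIndices_SSE2
performs with packed byte compares, assuming the 8-alpha DXT5 mode used above
(entry 0 = maxAlpha, entry 1 = minAlpha, six interpolants in between). The
thresholds ab1..ab7 are the midpoints between consecutive palette entries, each
pixel is classified by counting how many midpoints its alpha reaches, and indices
0 and 1 are swapped at the end because of the entry ordering. The function name,
the out6 parameter and the exact integer rounding are assumptions made for this
sketch; the SIMD path approximates the division by 14 with a pmulhw, and it reads
the alpha (fourth) byte of each RGBA texel as the three-argument overload does.
========================
*/
#if 0

static void ScalarEmitAlphaIndicesSketch( const byte *colorBlock, int minAlpha, int maxAlpha, byte *out6 ) {
	// midpoints between consecutive palette values, matching the ab1..ab7 comments above;
	// the + 7 mirrors the SIMD_SSE2_word_7 rounding added before the division by 14
	const int ab1 = ( 13 * maxAlpha +  1 * minAlpha + 7 ) / 14;
	const int ab2 = ( 11 * maxAlpha +  3 * minAlpha + 7 ) / 14;
	const int ab3 = (  9 * maxAlpha +  5 * minAlpha + 7 ) / 14;
	const int ab4 = (  7 * maxAlpha +  7 * minAlpha + 7 ) / 14;
	const int ab5 = (  5 * maxAlpha +  9 * minAlpha + 7 ) / 14;
	const int ab6 = (  3 * maxAlpha + 11 * minAlpha + 7 ) / 14;
	const int ab7 = (  1 * maxAlpha + 13 * minAlpha + 7 ) / 14;

	unsigned long long bits = 0;
	for ( int i = 0; i < 16; i++ ) {
		const int a = colorBlock[i * 4 + 3];
		// count how many midpoints this alpha reaches
		int n = 0;
		n += ( a >= ab1 );
		n += ( a >= ab2 );
		n += ( a >= ab3 );
		n += ( a >= ab4 );
		n += ( a >= ab5 );
		n += ( a >= ab6 );
		n += ( a >= ab7 );
		// same arithmetic the SIMD code does with saturated adds: 8 - n, wrapped to 3 bits,
		// then 0 and 1 swapped so that index 0 selects maxAlpha and index 1 selects minAlpha
		int index = ( 8 - n ) & 7;
		if ( index < 2 ) {
			index ^= 1;
		}
		bits |= (unsigned long long) index << ( 3 * i );
	}
	// 16 three-bit indices are 48 bits = 6 bytes per block, which is why the SSE2 path
	// advances outData by 6 after writing two overlapping dwords
	for ( int i = 0; i < 6; i++ ) {
		out6[i] = (byte) ( bits >> ( 8 * i ) );
	}
}

#endif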