32bpp_sse_func.hpp

Go to the documentation of this file.
00001 /* $Id: 32bpp_sse_func.hpp 26334 2014-02-11 21:17:43Z frosch $ */
00002 
00003 /*
00004  * This file is part of OpenTTD.
00005  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
00006  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00007  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
00008  */
00009 
00012 #ifndef BLITTER_32BPP_SSE_FUNC_HPP
00013 #define BLITTER_32BPP_SSE_FUNC_HPP
00014 
00015 #ifdef WITH_SSE
00016 
00017 static inline void InsertFirstUint32(const uint32 value, __m128i &into)
00018 {
00019 #if (SSE_VERSION >= 4)
00020   into = _mm_insert_epi32(into, value, 0);
00021 #else
00022   into = _mm_insert_epi16(into, value, 0);
00023   into = _mm_insert_epi16(into, value >> 16, 1);
00024 #endif
00025 }
00026 
00027 static inline void InsertSecondUint32(const uint32 value, __m128i &into)
00028 {
00029 #if (SSE_VERSION >= 4)
00030   into = _mm_insert_epi32(into, value, 1);
00031 #else
00032   into = _mm_insert_epi16(into, value, 2);
00033   into = _mm_insert_epi16(into, value >> 16, 3);
00034 #endif
00035 }
00036 
00037 static inline void LoadUint64(const uint64 value, __m128i &into)
00038 {
00039 #ifdef _SQ64
00040   into = _mm_cvtsi64_si128(value);
00041 #else
00042   #if (SSE_VERSION >= 4)
00043     into = _mm_cvtsi32_si128(value);
00044     InsertSecondUint32(value >> 32, into);
00045   #else
00046     (*(um128i*) &into).m128i_u64[0] = value;
00047   #endif
00048 #endif
00049 }
00050 
00051 static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask)
00052 {
00053 #if (SSE_VERSION == 2)
00054   from = _mm_and_si128(from, mask);    // PAND, wipe high bytes to keep low bytes when packing
00055   return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation)
00056 #else
00057   return _mm_shuffle_epi8(from, mask);
00058 #endif
00059 }
00060 
00061 static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask)
00062 {
00063 #if (SSE_VERSION == 2)
00064   __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1
00065   return _mm_shufflehi_epi16(alphaAB, 0x3F);         // PSHUFHW, put alpha2 in front of each rgb2
00066 #else
00067   return _mm_shuffle_epi8(from, mask);
00068 #endif
00069 }
00070 
00071 static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask)
00072 {
00073   __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());   // PUNPCKLBW, expand each uint8 into uint16
00074   __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
00075 
00076   __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++;
00077   alphaAB = _mm_srli_epi16(alphaAB, 15);
00078   alphaAB = _mm_add_epi16(alphaAB, srcAB);
00079   alphaAB = DistributeAlpha(alphaAB, distribution_mask);
00080 
00081   srcAB = _mm_sub_epi16(srcAB, dstAB);     // PSUBW,    (r - Cr)
00082   srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr)
00083   srcAB = _mm_srli_epi16(srcAB, 8);        // PSRLW,  a*(r - Cr)/256
00084   srcAB = _mm_add_epi16(srcAB, dstAB);     // PADDW,  a*(r - Cr)/256 + Cr
00085   return PackUnsaturated(srcAB, pack_mask);
00086 }
00087 
00088 /* Darken 2 pixels.
00089  * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
00090  */
00091 static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base)
00092 {
00093   __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
00094   __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
00095   __m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
00096   alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
00097   __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
00098   dstAB = _mm_mullo_epi16(dstAB, nom);
00099   dstAB = _mm_srli_epi16(dstAB, 8);
00100   return _mm_packus_epi16(dstAB, dstAB);
00101 }
00102 
00103 IGNORE_UNINITIALIZED_WARNING_START
00104 static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness)
00105 {
00106   uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
00107   c16 *= brightness;
00108   uint64 c16_ob = c16; // Helps out of order execution.
00109   c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS;
00110   c16 &= 0x01FF01FF01FFULL;
00111 
00112   /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
00113   c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001ULL) * 0xFF) & c16;
00114   const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
00115 
00116   const uint32 alpha32 = colour.data & 0xFF000000;
00117   __m128i ret;
00118   LoadUint64(c16, ret);
00119   if (ob != 0) {
00120     __m128i ob128 = _mm_cvtsi32_si128(ob);
00121     ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
00122     __m128i white = OVERBRIGHT_VALUE_MASK;
00123     __m128i c128 = ret;
00124     ret = _mm_subs_epu16(white, c128); // PSUBUSW,   (255 - rgb)
00125     ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb)
00126     ret = _mm_srli_epi16(ret, 8);      // PSRLW,  ob*(255 - rgb)/256
00127     ret = _mm_add_epi16(ret, c128);    // PADDW,  ob*(255 - rgb)/256 + rgb
00128   }
00129 
00130   ret = _mm_packus_epi16(ret, ret);      // PACKUSWB, saturate and pack.
00131   return alpha32 | _mm_cvtsi128_si32(ret);
00132 }
00133 IGNORE_UNINITIALIZED_WARNING_STOP
00134 
00138 static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness)
00139 {
00140   /* Shortcut for normal brightness. */
00141   if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS) return colour;
00142 
00143   return ReallyAdjustBrightness(colour, brightness);
00144 }
00145 
00146 static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness)
00147 {
00148 #if (SSE_VERSION < 3)
00149   NOT_REACHED();
00150 #else
00151   /* The following dataflow differs from the one of AdjustBrightness() only for alpha.
00152    * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).
00153    * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
00154    */
00155   brightness &= 0xFF00FF00;
00156   brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS;
00157 
00158   __m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());
00159   __m128i briAB = _mm_cvtsi32_si128(brightness);
00160   briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2.
00161   colAB = _mm_mullo_epi16(colAB, briAB);
00162   __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7);
00163   colAB = _mm_srli_epi16(colAB, 7);
00164 
00165   /* Sum overbright.
00166    * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
00167    * -255 is changed in -256 so we just have to take the 8 lower bits into account.
00168    */
00169   colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
00170   colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
00171   colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);
00172   colAB_ob = _mm_and_si128(colAB_ob, colAB);
00173   __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());
00174 
00175   obAB = _mm_srli_epi16(obAB, 1);        // Reduce overbright strength.
00176   obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);
00177   __m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white.
00178   retAB = _mm_subs_epu16(retAB, colAB);  //    (255 - rgb)
00179   retAB = _mm_mullo_epi16(retAB, obAB);  // ob*(255 - rgb)
00180   retAB = _mm_srli_epi16(retAB, 8);      // ob*(255 - rgb)/256
00181   retAB = _mm_add_epi16(retAB, colAB);   // ob*(255 - rgb)/256 + rgb
00182 
00183   return _mm_packus_epi16(retAB, retAB);
00184 #endif
00185 }
00186 
00187 #if FULL_ANIMATION == 0
00188 
00195 IGNORE_UNINITIALIZED_WARNING_START
      /**
       * Draws the sprite in bp->sprite onto bp->dst, specialised at compile time.
       *
       * @tparam mode        Blitter mode; BM_COLOUR_REMAP and BM_TRANSPARENT have dedicated code,
       *                     every other mode takes the plain alpha blending path.
       * @tparam read_mode   RM_WITH_MARGIN makes every line read two values stored at the start of
       *                     the source line (src_rgba_line[0] and [1]) to skip pixels at the start
       *                     and shorten the line; any other value applies bp->skip_left once up front.
       * @tparam bt_last     BT_ODD forces handling of one trailing single pixel per line, BT_NONE
       *                     decides that at run time from the width, other values never handle one.
       * @tparam translucent Only looked at in the plain path: when false, pixels with non-zero alpha
       *                     are copied as they are instead of being blended.
       * @param bp   Blitting parameters (destination, sizes, skips, sprite, remap table).
       * @param zoom Zoom level selecting the SpriteInfo inside the sprite data.
       */
00196 template <BlitterMode mode, Blitter_32bppSSE2::ReadMode read_mode, Blitter_32bppSSE2::BlockType bt_last, bool translucent>
00197 #if (SSE_VERSION == 2)
00198 inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom)
00199 #elif (SSE_VERSION == 3)
00200 inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom)
00201 #elif (SSE_VERSION == 4)
00202 inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom)
00203 #endif
00204 {
00205   const byte * const remap = bp->remap;
00206   Colour *dst_line = (Colour *) bp->dst + bp->top * bp->pitch + bp->left;
00207   int effective_width = bp->width;
00208 
00209   /* Find where to start reading in the source sprite. */
00210   const SpriteData * const sd = (const SpriteData *) bp->sprite;
00211   const SpriteInfo * const si = &sd->infos[zoom];
00212   const MapValue *src_mv_line = (const MapValue *) &sd->data[si->mv_offset] + bp->skip_top * si->sprite_width;
00213   const Colour *src_rgba_line = (const Colour *) ((const byte *) &sd->data[si->sprite_offset] + bp->skip_top * si->sprite_line_size);
00214 
        /* bp->skip_left is only applied here; the RM_WITH_MARGIN variant never uses it. */
00215   if (read_mode != RM_WITH_MARGIN) {
00216     src_rgba_line += bp->skip_left;
00217     src_mv_line += bp->skip_left;
00218   }
00219   const MapValue *src_mv = src_mv_line;
00220 
00221   /* Load these variables into register before loop. */
        /* The *_PARAM_* macros hide that the helpers take different masks for SSE2 and for SSSE3+.
         * They are not #undef'ed at the end of the function. */
00222 #if (SSE_VERSION == 2)
00223   const __m128i clear_hi    = CLEAR_HIGH_BYTE_MASK;
00224   #define ALPHA_BLEND_PARAM_1 clear_hi
00225   #define ALPHA_BLEND_PARAM_2 clear_hi
00226   #define DARKEN_PARAM_1      tr_nom_base
00227   #define DARKEN_PARAM_2      tr_nom_base
00228 #else
00229   const __m128i a_cm        = ALPHA_CONTROL_MASK;
00230   const __m128i pack_low_cm = PACK_LOW_CONTROL_MASK;
00231   #define ALPHA_BLEND_PARAM_1 a_cm
00232   #define ALPHA_BLEND_PARAM_2 pack_low_cm
00233   #define DARKEN_PARAM_1      a_cm
00234   #define DARKEN_PARAM_2      tr_nom_base
00235 #endif
00236   const __m128i tr_nom_base = TRANSPARENT_NOM_BASE;
00237 
        /* One iteration per destination line. */
00238   for (int y = bp->height; y != 0; y--) {
00239     Colour *dst = dst_line;
          /* The pixels of a line start META_LENGTH entries after the start of the line. */
00240     const Colour *src = src_rgba_line + META_LENGTH;
00241     if (mode == BM_COLOUR_REMAP) src_mv = src_mv_line;
00242 
00243     if (read_mode == RM_WITH_MARGIN) {
00244       assert(bt_last == BT_NONE); // or you must ensure block type is preserved
            /* src_rgba_line[0] tells how many pixels to skip at the start of this line
             * (presumably the fully transparent left margin -- confirm with the encoder). */
00245       src += src_rgba_line[0].data;
00246       dst += src_rgba_line[0].data;
00247       if (mode == BM_COLOUR_REMAP) src_mv += src_rgba_line[0].data;
            /* src_rgba_line[1] only shortens the line as far as it exceeds the part
             * of the sprite width that is not drawn anyway (width_diff). */
00248       const int width_diff = si->sprite_width - bp->width;
00249       effective_width = bp->width - (int) src_rgba_line[0].data;
00250       const int delta_diff = (int) src_rgba_line[1].data - width_diff;
00251       const int new_width = effective_width - delta_diff;
00252       effective_width = delta_diff > 0 ? new_width : effective_width;
00253       if (effective_width <= 0) goto next_line;
00254     }
00255 
00256     switch (mode) {
00257       default:
00258         if (!translucent) {
                /* No blending needed: copy every pixel that has a non-zero alpha. */
00259           for (uint x = (uint) effective_width; x > 0; x--) {
00260             if (src->a) *dst = *src;
00261             src++;
00262             dst++;
00263           }
00264           break;
00265         }
00266 
              /* Blend two pixels per iteration. */
00267         for (uint x = (uint) effective_width / 2; x > 0; x--) {
00268           __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
00269           __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
00270           _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2));
00271           src += 2;
00272           dst += 2;
00273         }
00274 
              /* Blend the trailing single pixel, if there is one. */
00275         if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
00276           __m128i srcABCD = _mm_cvtsi32_si128(src->data);
00277           __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
00278           dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2));
00279         }
00280         break;
00281 
00282       case BM_COLOUR_REMAP:
              /* With SSSE3 or later the loop below handles two pixels per iteration and the
               * single pixel code after it only runs for a trailing odd pixel. With SSE2 that
               * same single pixel code is the body of a loop over all pixels of the line. */
00283 #if (SSE_VERSION >= 3)
00284         for (uint x = (uint) effective_width / 2; x > 0; x--) {
00285           __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
00286           __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
                /* Read two adjacent MapValue entries with a single 32 bit load. */
00287           uint32 mvX2 = *((uint32 *) const_cast<MapValue *>(src_mv));
00288 
00289           /* Remap colours. */
                /* Only taken when the bits tested by 0x00FF00FF are set for at least one of the two pixels. */
00290           if (mvX2 & 0x00FF00FF) {
00291             #define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \
00292               /* Written so the compiler uses CMOV. */ \
00293               Colour m_colour = m_colour_init; \
00294               { \
00295               const Colour srcm = (Colour) (m_src); \
00296               const uint m = (byte) (m_m); \
00297               const uint r = remap[m]; \
00298               const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
00299               m_colour = r == 0 ? m_colour : cmap; \
00300               m_colour = m != 0 ? m_colour : srcm; \
00301               }
00302 #ifdef _SQ64
00303             uint64 srcs = _mm_cvtsi128_si64(srcABCD);
00304             uint64 remapped_src = 0;
00305             CMOV_REMAP(c0, 0, srcs, mvX2);
00306             remapped_src = c0.data;
00307             CMOV_REMAP(c1, 0, srcs >> 32, mvX2 >> 16);
00308             remapped_src |= (uint64) c1.data << 32;
00309             srcABCD = _mm_cvtsi64_si128(remapped_src);
00310 #else
00311             Colour remapped_src[2];
00312             CMOV_REMAP(c0, 0, _mm_cvtsi128_si32(srcABCD), mvX2);
00313             remapped_src[0] = c0.data;
00314             CMOV_REMAP(c1, 0, src[1], mvX2 >> 16);
00315             remapped_src[1] = c1.data;
00316             srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
00317 #endif
00318 
                  /* Skip the brightness adjustment when both pixels carry the value 0x80 in the tested bytes. */
00319             if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
00320           }
00321 
00322           /* Blend colours. */
00323           _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2));
00324           dst += 2;
00325           src += 2;
00326           src_mv += 2;
00327         }
00328 
00329         if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
00330 #else
00331         for (uint x = (uint) effective_width; x > 0; x--) {
00332 #endif
00333           /* In case the m-channel is zero, do not remap this pixel in any way. */
00334           __m128i srcABCD;
00335           if (src_mv->m) {
00336             const uint r = remap[src_mv->m];
                  /* A remap result of 0 leaves the destination pixel untouched. */
00337             if (r != 0) {
00338               Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
00339               if (src->a == 255) {
00340                 *dst = remapped_colour;
00341               } else {
00342                 remapped_colour.a = src->a;
00343                 srcABCD = _mm_cvtsi32_si128(remapped_colour.data);
                      /* Jumps into the else branch below to share its blend and store code. */
00344                 goto bmcr_alpha_blend_single;
00345               }
00346             }
00347           } else {
00348             srcABCD = _mm_cvtsi32_si128(src->data);
00349             if (src->a < 255) {
00350 bmcr_alpha_blend_single:
00351               __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
00352               srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, ALPHA_BLEND_PARAM_1, ALPHA_BLEND_PARAM_2);
00353             }
00354             dst->data = _mm_cvtsi128_si32(srcABCD);
00355           }
                /* Only the SSE2 per-pixel loop has to advance; otherwise this was the last pixel of the line. */
00356 #if (SSE_VERSION == 2)
00357           src_mv++;
00358           dst++;
00359           src++;
00360 #endif
00361         }
00362         break;
00363 
00364       case BM_TRANSPARENT:
00365         /* Make the current colour a bit more black, so it looks like this image is transparent. */
              /* NOTE(review): uses bp->width, not effective_width. The dispatcher in this file only
               * instantiates BM_TRANSPARENT with RM_NONE, in which case both are equal. */
00366         for (uint x = (uint) bp->width / 2; x > 0; x--) {
00367           __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
00368           __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
00369           _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
00370           src += 2;
00371           dst += 2;
00372         }
00373 
00374         if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
00375           __m128i srcABCD = _mm_cvtsi32_si128(src->data);
00376           __m128i dstABCD = _mm_cvtsi32_si128(dst->data);
00377           dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, DARKEN_PARAM_1, DARKEN_PARAM_2));
00378         }
00379         break;
00380     }
00381 
00382 next_line:
          /* Advance to the next source and destination line. */
00383     if (mode == BM_COLOUR_REMAP) src_mv_line += si->sprite_width;
00384     src_rgba_line = (const Colour*) ((const byte*) src_rgba_line + si->sprite_line_size);
00385     dst_line += bp->pitch;
00386   }
00387 }
00388 IGNORE_UNINITIALIZED_WARNING_STOP
00389 
00397 #if (SSE_VERSION == 2)
00398 void Blitter_32bppSSE2::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom)
00399 #elif (SSE_VERSION == 3)
00400 void Blitter_32bppSSSE3::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom)
00401 #elif (SSE_VERSION == 4)
00402 void Blitter_32bppSSE4::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom)
00403 #endif
00404 {
00405   switch (mode) {
00406     default: {
00407       if (bp->skip_left != 0 || bp->width <= MARGIN_NORMAL_THRESHOLD) {
00408 bm_normal:
00409         const BlockType bt_last = (BlockType) (bp->width & 1);
00410         switch (bt_last) {
00411           default:     Draw<BM_NORMAL, RM_WITH_SKIP, BT_EVEN, true>(bp, zoom); return;
00412           case BT_ODD: Draw<BM_NORMAL, RM_WITH_SKIP, BT_ODD, true>(bp, zoom); return;
00413         }
00414       } else {
00415         if (((const Blitter_32bppSSE_Base::SpriteData *) bp->sprite)->flags & SF_TRANSLUCENT) {
00416           Draw<BM_NORMAL, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom);
00417         } else {
00418           Draw<BM_NORMAL, RM_WITH_MARGIN, BT_NONE, false>(bp, zoom);
00419         }
00420         return;
00421       }
00422       break;
00423     }
00424     case BM_COLOUR_REMAP:
00425       if (((const Blitter_32bppSSE_Base::SpriteData *) bp->sprite)->flags & SF_NO_REMAP) goto bm_normal;
00426       if (bp->skip_left != 0 || bp->width <= MARGIN_REMAP_THRESHOLD) {
00427         Draw<BM_COLOUR_REMAP, RM_WITH_SKIP, BT_NONE, true>(bp, zoom); return;
00428       } else {
00429         Draw<BM_COLOUR_REMAP, RM_WITH_MARGIN, BT_NONE, true>(bp, zoom); return;
00430       }
00431     case BM_TRANSPARENT:  Draw<BM_TRANSPARENT, RM_NONE, BT_NONE, true>(bp, zoom); return;
00432   }
00433 }
00434 #endif /* FULL_ANIMATION */
00435 
00436 #endif /* WITH_SSE */
00437 #endif /* BLITTER_32BPP_SSE_FUNC_HPP */