tenfourfox/gfx/2d/Blur.cpp

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Blur.h"

#include <algorithm>
#include <math.h>
#include <string.h>

#define BENCHMARK_VMX 0
#if BENCHMARK_VMX
#include <sys/time.h>
#endif

#include "mozilla/CheckedInt.h"

#include "2D.h"
#include "DataSurfaceHelpers.h"
#include "Tools.h"

#ifdef BUILD_ARM_NEON
#include "mozilla/arm.h"
#endif

using namespace std;

namespace mozilla {
namespace gfx {

/**
 * Box blur involves looking at one pixel, and setting its value to the average
 * of its neighbouring pixels.
 * @param aInput The input buffer.
 * @param aOutput The output buffer.
 * @param aLeftLobe The number of pixels to blend on the left.
 * @param aRightLobe The number of pixels to blend on the right.
 * @param aWidth The number of columns in the buffers.
 * @param aRows The number of rows in the buffers.
 * @param aSkipRect An area to skip blurring in.
 * XXX shouldn't we pass stride in separately here?
 */
static void
BoxBlurHorizontal(unsigned char* aInput,
                  unsigned char* aOutput,
                  int32_t aLeftLobe,
                  int32_t aRightLobe,
                  int32_t aWidth,
                  int32_t aRows,
                  const IntRect& aSkipRect)
{
    MOZ_ASSERT(aWidth > 0);

    int32_t boxSize = aLeftLobe + aRightLobe + 1;
    bool skipRectCoversWholeRow = 0 >= aSkipRect.x &&
                                  aWidth <= aSkipRect.XMost();
    if (boxSize == 1) {
        memcpy(aOutput, aInput, aWidth*aRows);
        return;
    }
    uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);

    for (int32_t y = 0; y < aRows; y++) {
        // Check whether the skip rect intersects this row. If the skip
        // rect covers the whole surface in this row, we can avoid
        // this row entirely (and any others along the skip rect).
        bool inSkipRectY = y >= aSkipRect.y &&
                           y < aSkipRect.YMost();
        if (inSkipRectY && skipRectCoversWholeRow) {
            y = aSkipRect.YMost() - 1;
            continue;
        }

        uint32_t alphaSum = 0;
        for (int32_t i = 0; i < boxSize; i++) {
            int32_t pos = i - aLeftLobe;
            // See assertion above; if aWidth is zero, then we would have no
            // valid position to clamp to.
            pos = max(pos, 0);
            pos = min(pos, aWidth - 1);
            alphaSum += aInput[aWidth * y + pos];
        }
        for (int32_t x = 0; x < aWidth; x++) {
            // Check whether we are within the skip rect. If so, go
            // to the next point outside the skip rect.
            if (inSkipRectY && x >= aSkipRect.x &&
                x < aSkipRect.XMost()) {
                x = aSkipRect.XMost();
                if (x >= aWidth)
                    break;

                // Recalculate the neighbouring alpha values for
                // our new point on the surface.
                alphaSum = 0;
                for (int32_t i = 0; i < boxSize; i++) {
                    int32_t pos = x + i - aLeftLobe;
                    // See assertion above; if aWidth is zero, then we would have no
                    // valid position to clamp to.
                    pos = max(pos, 0);
                    pos = min(pos, aWidth - 1);
                    alphaSum += aInput[aWidth * y + pos];
                }
            }
            int32_t tmp = x - aLeftLobe;
            int32_t last = max(tmp, 0);
            int32_t next = min(tmp + boxSize, aWidth - 1);

            aOutput[aWidth * y + x] = (uint64_t(alphaSum) * reciprocal) >> 32;

            alphaSum += aInput[aWidth * y + next] -
                        aInput[aWidth * y + last];
        }
    }
}

/**
 * Identical to BoxBlurHorizontal, except it blurs top and bottom instead of
 * left and right.
 * XXX shouldn't we pass stride in separately here?
 */
static void
BoxBlurVertical(unsigned char* aInput,
                unsigned char* aOutput,
                int32_t aTopLobe,
                int32_t aBottomLobe,
                int32_t aWidth,
                int32_t aRows,
                const IntRect& aSkipRect)
{
    MOZ_ASSERT(aRows > 0);

    int32_t boxSize = aTopLobe + aBottomLobe + 1;
    bool skipRectCoversWholeColumn = 0 >= aSkipRect.y &&
                                     aRows <= aSkipRect.YMost();
    if (boxSize == 1) {
        memcpy(aOutput, aInput, aWidth*aRows);
        return;
    }
    uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);

    for (int32_t x = 0; x < aWidth; x++) {
        bool inSkipRectX = x >= aSkipRect.x &&
                           x < aSkipRect.XMost();
        if (inSkipRectX && skipRectCoversWholeColumn) {
            x = aSkipRect.XMost() - 1;
            continue;
        }

        uint32_t alphaSum = 0;
        for (int32_t i = 0; i < boxSize; i++) {
            int32_t pos = i - aTopLobe;
            // See assertion above; if aRows is zero, then we would have no
            // valid position to clamp to.
            pos = max(pos, 0);
            pos = min(pos, aRows - 1);
            alphaSum += aInput[aWidth * pos + x];
        }
        for (int32_t y = 0; y < aRows; y++) {
            if (inSkipRectX && y >= aSkipRect.y &&
                y < aSkipRect.YMost()) {
                y = aSkipRect.YMost();
                if (y >= aRows)
                    break;

                alphaSum = 0;
                for (int32_t i = 0; i < boxSize; i++) {
                    int32_t pos = y + i - aTopLobe;
                    // See assertion above; if aRows is zero, then we would have no
                    // valid position to clamp to.
                    pos = max(pos, 0);
                    pos = min(pos, aRows - 1);
                    alphaSum += aInput[aWidth * pos + x];
                }
            }
            int32_t tmp = y - aTopLobe;
            int32_t last = max(tmp, 0);
            int32_t next = min(tmp + boxSize, aRows - 1);

            aOutput[aWidth * y + x] = (uint64_t(alphaSum) * reciprocal) >> 32;

            alphaSum += aInput[aWidth * next + x] -
                        aInput[aWidth * last + x];
        }
    }
}

static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2])
{
    int32_t major, minor, final;

    /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
     * some notes about approximating the Gaussian blur with box-blurs.
     * The comments below are in the terminology of that page.
     */
    int32_t z = aRadius / 3;
    switch (aRadius % 3) {
    case 0:
        // aRadius = z*3; choose d = 2*z + 1
        major = minor = final = z;
        break;
    case 1:
        // aRadius = z*3 + 1
        // This is a tricky case since there is no value of d which will
        // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
        // for some integer k, then the radius will be 3*k. If d is even,
        // i.e. d=2*k, then the radius will be 3*k - 1.
        // So we have to choose values that don't match the standard
        // algorithm.
        major = z + 1;
        minor = final = z;
        break;
    case 2:
        // aRadius = z*3 + 2; choose d = 2*z + 2
        major = final = z + 1;
        minor = z;
        break;
    default:
        // Mathematical impossibility!
        MOZ_ASSERT(false);
        major = minor = final = 0;
    }
    MOZ_ASSERT(major + minor + final == aRadius);

    aLobes[0][0] = major;
    aLobes[0][1] = minor;
    aLobes[1][0] = minor;
    aLobes[1][1] = major;
    aLobes[2][0] = final;
    aLobes[2][1] = final;
}

static void
SpreadHorizontal(unsigned char* aInput,
                 unsigned char* aOutput,
                 int32_t aRadius,
                 int32_t aWidth,
                 int32_t aRows,
                 int32_t aStride,
                 const IntRect& aSkipRect)
{
    if (aRadius == 0) {
        memcpy(aOutput, aInput, aStride * aRows);
        return;
    }

    bool skipRectCoversWholeRow = 0 >= aSkipRect.x &&
                                    aWidth <= aSkipRect.XMost();
    for (int32_t y = 0; y < aRows; y++) {
        // Check whether the skip rect intersects this row. If the skip
        // rect covers the whole surface in this row, we can avoid
        // this row entirely (and any others along the skip rect).
        bool inSkipRectY = y >= aSkipRect.y &&
                             y < aSkipRect.YMost();
        if (inSkipRectY && skipRectCoversWholeRow) {
            y = aSkipRect.YMost() - 1;
            continue;
        }

        for (int32_t x = 0; x < aWidth; x++) {
            // Check whether we are within the skip rect. If so, go
            // to the next point outside the skip rect.
            if (inSkipRectY && x >= aSkipRect.x &&
                x < aSkipRect.XMost()) {
                x = aSkipRect.XMost();
                if (x >= aWidth)
                    break;
            }

            int32_t sMin = max(x - aRadius, 0);
            int32_t sMax = min(x + aRadius, aWidth - 1);
            int32_t v = 0;
            for (int32_t s = sMin; s <= sMax; ++s) {
                v = max<int32_t>(v, aInput[aStride * y + s]);
            }
            aOutput[aStride * y + x] = v;
        }
    }
}

static void
SpreadVertical(unsigned char* aInput,
               unsigned char* aOutput,
               int32_t aRadius,
               int32_t aWidth,
               int32_t aRows,
               int32_t aStride,
               const IntRect& aSkipRect)
{
    if (aRadius == 0) {
        memcpy(aOutput, aInput, aStride * aRows);
        return;
    }

    bool skipRectCoversWholeColumn = 0 >= aSkipRect.y &&
                                     aRows <= aSkipRect.YMost();
    for (int32_t x = 0; x < aWidth; x++) {
        bool inSkipRectX = x >= aSkipRect.x &&
                           x < aSkipRect.XMost();
        if (inSkipRectX && skipRectCoversWholeColumn) {
            x = aSkipRect.XMost() - 1;
            continue;
        }

        for (int32_t y = 0; y < aRows; y++) {
            // Check whether we are within the skip rect. If so, go
            // to the next point outside the skip rect.
            if (inSkipRectX && y >= aSkipRect.y &&
                y < aSkipRect.YMost()) {
                y = aSkipRect.YMost();
                if (y >= aRows)
                    break;
            }

            int32_t sMin = max(y - aRadius, 0);
            int32_t sMax = min(y + aRadius, aRows - 1);
            int32_t v = 0;
            for (int32_t s = sMin; s <= sMax; ++s) {
                v = max<int32_t>(v, aInput[aStride * s + x]);
            }
            aOutput[aStride * y + x] = v;
        }
    }
}

CheckedInt<int32_t>
AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal)
{
  CheckedInt<int32_t> val(aVal);

  val += 3;
  val /= 4;
  val *= 4;

  return val;
}

AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect,
                           const IntSize& aSpreadRadius,
                           const IntSize& aBlurRadius,
                           const Rect* aDirtyRect,
                           const Rect* aSkipRect)
 : mSpreadRadius(aSpreadRadius),
   mBlurRadius(aBlurRadius),
   mSurfaceAllocationSize(0)
{
  Rect rect(aRect);
  rect.Inflate(Size(aBlurRadius + aSpreadRadius));
  rect.RoundOut();

  if (aDirtyRect) {
    // If we get passed a dirty rect from layout, we can minimize the
    // shadow size and make painting faster.
    mHasDirtyRect = true;
    mDirtyRect = *aDirtyRect;
    Rect requiredBlurArea = mDirtyRect.Intersect(rect);
    requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius));
    rect = requiredBlurArea.Intersect(rect);
  } else {
    mHasDirtyRect = false;
  }

  mRect = IntRect(int32_t(rect.x), int32_t(rect.y),
                  int32_t(rect.width), int32_t(rect.height));
  if (mRect.IsEmpty()) {
    return;
  }

  if (aSkipRect) {
    // If we get passed a skip rect, we can lower the amount of
    // blurring/spreading we need to do. We convert it to IntRect to avoid
    // expensive int<->float conversions if we were to use Rect instead.
    Rect skipRect = *aSkipRect;
    skipRect.RoundIn();
    skipRect.Deflate(Size(aBlurRadius + aSpreadRadius));
    mSkipRect = IntRect(int32_t(skipRect.x), int32_t(skipRect.y),
                        int32_t(skipRect.width), int32_t(skipRect.height));

    mSkipRect = mSkipRect.Intersect(mRect);
    if (mSkipRect.IsEqualInterior(mRect))
      return;

    mSkipRect -= mRect.TopLeft();
  } else {
    mSkipRect = IntRect(0, 0, 0, 0);
  }

  CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.width);
  if (stride.isValid()) {
    mStride = stride.value();

    // We need to leave room for an additional 3 bytes for a potential overrun
    // in our blurring code.
    size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.height, 3);
    if (size != 0) {
      mSurfaceAllocationSize = size;
    }
  }
}

AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect,
                           int32_t aStride,
                           float aSigmaX,
                           float aSigmaY)
  : mRect(int32_t(aRect.x), int32_t(aRect.y),
          int32_t(aRect.width), int32_t(aRect.height)),
    mSpreadRadius(),
    mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))),
    mStride(aStride),
    mSurfaceAllocationSize(0)
{
  IntRect intRect;
  if (aRect.ToIntRect(&intRect)) {
    size_t minDataSize = BufferSizeFromStrideAndHeight(intRect.width, intRect.height);
    if (minDataSize != 0) {
      mSurfaceAllocationSize = minDataSize;
    }
  }
}


AlphaBoxBlur::~AlphaBoxBlur()
{
}

IntSize
AlphaBoxBlur::GetSize()
{
  IntSize size(mRect.width, mRect.height);
  return size;
}

int32_t
AlphaBoxBlur::GetStride()
{
  return mStride;
}

IntRect
AlphaBoxBlur::GetRect()
{
  return mRect;
}

Rect*
AlphaBoxBlur::GetDirtyRect()
{
  if (mHasDirtyRect) {
    return &mDirtyRect;
  }

  return nullptr;
}

#if BENCHMARK_VMX
double t() {
    struct timeval tv;
    gettimeofday(&tv, NULL);

    return (tv.tv_sec * 1000.0) + ((double)tv.tv_usec / 1000.0);
}
#endif

size_t
AlphaBoxBlur::GetSurfaceAllocationSize() const
{
  return mSurfaceAllocationSize;
}

void
AlphaBoxBlur::Blur(uint8_t* aData)
{
  if (!aData) {
    return;
  }

  // no need to do all this if not blurring or spreading
  if (mBlurRadius != IntSize(0,0) || mSpreadRadius != IntSize(0,0)) {
    int32_t stride = GetStride();

    IntSize size = GetSize();

    if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
      // No need to use CheckedInt here - we have validated it in the constructor.
      size_t szB = stride * size.height;
      unsigned char* tmpData = new (std::nothrow) uint8_t[szB];

      if (!tmpData) {
        return;
      }

      memset(tmpData, 0, szB);

      SpreadHorizontal(aData, tmpData, mSpreadRadius.width, GetSize().width, GetSize().height, stride, mSkipRect);
      SpreadVertical(tmpData, aData, mSpreadRadius.height, GetSize().width, GetSize().height, stride, mSkipRect);

      delete [] tmpData;
    }

    int32_t horizontalLobes[3][2];
    ComputeLobes(mBlurRadius.width, horizontalLobes);
    int32_t verticalLobes[3][2];
    ComputeLobes(mBlurRadius.height, verticalLobes);

    // We want to allow for some extra space on the left for alignment reasons.
    int32_t maxLeftLobe = RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();

    IntSize integralImageSize(size.width + maxLeftLobe + horizontalLobes[1][1],
                              size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);

    if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) {
      // Fallback to old blurring code when the surface is so large it may
      // overflow our integral image!

      // No need to use CheckedInt here - we have validated it in the constructor.
      size_t szB = stride * size.height;
      uint8_t* tmpData = new (std::nothrow) uint8_t[szB];
      if (!tmpData) {
        return;
      }

      memset(tmpData, 0, szB);

      uint8_t* a = aData;
      uint8_t* b = tmpData;
      if (mBlurRadius.width > 0) {
        BoxBlurHorizontal(a, b, horizontalLobes[0][0], horizontalLobes[0][1], stride, GetSize().height, mSkipRect);
        BoxBlurHorizontal(b, a, horizontalLobes[1][0], horizontalLobes[1][1], stride, GetSize().height, mSkipRect);
        BoxBlurHorizontal(a, b, horizontalLobes[2][0], horizontalLobes[2][1], stride, GetSize().height, mSkipRect);
      } else {
        a = tmpData;
        b = aData;
      }
      // The result is in 'b' here.
      if (mBlurRadius.height > 0) {
        BoxBlurVertical(b, a, verticalLobes[0][0], verticalLobes[0][1], stride, GetSize().height, mSkipRect);
        BoxBlurVertical(a, b, verticalLobes[1][0], verticalLobes[1][1], stride, GetSize().height, mSkipRect);
        BoxBlurVertical(b, a, verticalLobes[2][0], verticalLobes[2][1], stride, GetSize().height, mSkipRect);
      } else {
        a = b;
      }
      // The result is in 'a' here.
      if (a == tmpData) {
        memcpy(aData, tmpData, szB);
      }
      delete [] tmpData;
    } else {
      size_t integralImageStride = GetAlignedStride<16>(integralImageSize.width * 4);

      // We need to leave room for an additional 12 bytes for a maximum overrun
      // of 3 pixels in the blurring code.
      size_t bufLen = BufferSizeFromStrideAndHeight(integralImageStride, integralImageSize.height, 12);
      if (bufLen == 0) {
        return;
      }
      // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
      // we divide by 4.
      AlignedArray<uint32_t> integralImage((bufLen / 4) + ((bufLen % 4) ? 1 : 0));

      if (!integralImage) {
        return;
      }

#ifdef USE_SSE2
      if (Factory::HasSSE2()) {
        BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
                     verticalLobes[0][1], integralImage, integralImageStride);
        BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
                     verticalLobes[1][1], integralImage, integralImageStride);
        BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
                     verticalLobes[2][1], integralImage, integralImageStride);
      } else
#elif defined(USE_VMX)
      // It is unpossible to have USE_VMX defined in TenFourFox on G3 or debug.
      if(Factory::HasVMX()) {
#if BENCHMARK_VMX
double t0, t1, tdiff;
t0 = t();
        BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
                  verticalLobes[0][1], integralImage, integralImageStride);
        BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
                  verticalLobes[1][1], integralImage, integralImageStride);
        BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
                  verticalLobes[2][1], integralImage, integralImageStride);
t1 = t();
tdiff = t1 - t0;

t0 = t();
#endif
        BoxBlur_VMX(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
                    verticalLobes[0][1], integralImage, integralImageStride);
        BoxBlur_VMX(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
                    verticalLobes[1][1], integralImage, integralImageStride);
        BoxBlur_VMX(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
                    verticalLobes[2][1], integralImage, integralImageStride);
#if BENCHMARK_VMX
t1 = t();

printf("C/VMX: %g\n", 100 - (((t1 - t0) / tdiff) * 100));
#endif
      } else
#endif
#ifdef BUILD_ARM_NEON
      if (mozilla::supports_neon()) {
        BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
                     verticalLobes[0][1], integralImage, integralImageStride);
        BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
                     verticalLobes[1][1], integralImage, integralImageStride);
        BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
                     verticalLobes[2][1], integralImage, integralImageStride);
      } else
#endif
      {
#ifdef _MIPS_ARCH_LOONGSON3A
        BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
                     verticalLobes[0][1], integralImage, integralImageStride);
        BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
                     verticalLobes[1][1], integralImage, integralImageStride);
        BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
                     verticalLobes[2][1], integralImage, integralImageStride);
#else
        BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
                  verticalLobes[0][1], integralImage, integralImageStride);
        BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
                  verticalLobes[1][1], integralImage, integralImageStride);
        BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
                  verticalLobes[2][1], integralImage, integralImageStride);
#endif
      }
    }
  }
}

MOZ_ALWAYS_INLINE void
GenerateIntegralRow(uint32_t  *aDest, const uint8_t *aSource, uint32_t *aPreviousRow,
                    const uint32_t &aSourceWidth, const uint32_t &aLeftInflation, const uint32_t &aRightInflation)
{
  uint32_t currentRowSum = 0;
  uint32_t pixel = aSource[0];
  for (uint32_t x = 0; x < aLeftInflation; x++) {
    currentRowSum += pixel;
    *aDest++ = currentRowSum + *aPreviousRow++;
  }
  for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x += 4) {
      uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
#if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
      currentRowSum += (alphaValues >> 24) & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
      currentRowSum += (alphaValues >> 16) & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
      currentRowSum += (alphaValues >> 8) & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
      currentRowSum += alphaValues & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
#else
      currentRowSum += alphaValues & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
      alphaValues >>= 8;
      currentRowSum += alphaValues & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
      alphaValues >>= 8;
      currentRowSum += alphaValues & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
      alphaValues >>= 8;
      currentRowSum += alphaValues & 0xff;
      *aDest++ = *aPreviousRow++ + currentRowSum;
#endif
  }
  pixel = aSource[aSourceWidth - 1];
  for (uint32_t x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
    currentRowSum += pixel;
    *aDest++ = currentRowSum + *aPreviousRow++;
  }
}

MOZ_ALWAYS_INLINE void
GenerateIntegralImage_C(int32_t aLeftInflation, int32_t aRightInflation,
                        int32_t aTopInflation, int32_t aBottomInflation,
                        uint32_t *aIntegralImage, size_t aIntegralImageStride,
                        uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
  uint32_t stride32bit = aIntegralImageStride / 4;

  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
                            aSize.height + aTopInflation + aBottomInflation);

  memset(aIntegralImage, 0, aIntegralImageStride);

  GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage,
                      aSize.width, aLeftInflation, aRightInflation);
  for (int y = 1; y < aTopInflation + 1; y++) {
    GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource, aIntegralImage + (y - 1) * stride32bit,
                        aSize.width, aLeftInflation, aRightInflation);
  }

  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
    GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + aSourceStride * (y - aTopInflation),
                        aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation);
  }

  if (aBottomInflation) {
    for (int y = (aSize.height + aTopInflation); y < integralImageSize.height; y++) {
      GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + ((aSize.height - 1) * aSourceStride),
                          aIntegralImage + (y - 1) * stride32bit,
                          aSize.width, aLeftInflation, aRightInflation);
    }
  }
}

/**
 * Attempt to do an in-place box blur using an integral image.
 */
void
AlphaBoxBlur::BoxBlur_C(uint8_t* aData,
                        int32_t aLeftLobe,
                        int32_t aRightLobe,
                        int32_t aTopLobe,
                        int32_t aBottomLobe,
                        uint32_t *aIntegralImage,
                        size_t aIntegralImageStride)
{
  IntSize size = GetSize();

  MOZ_ASSERT(size.width > 0);

  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
  // looking at an integral image the value of a pixel at 'x,y' is calculated
  // using the value of the integral image values above/below that.
  aLeftLobe++;
  aTopLobe++;
  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);

  MOZ_ASSERT(boxSize > 0);

  if (boxSize == 1) {
      return;
  }

  int32_t stride32bit = aIntegralImageStride / 4;

  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();

  GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
                          aIntegralImage, aIntegralImageStride, aData,
                          mStride, size);

  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);

  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;

  // Storing these locally makes this about 30% faster! Presumably the compiler
  // can't be sure we're not altering the member variables in this loop.
  IntRect skipRect = mSkipRect;
  uint8_t *data = aData;
  int32_t stride = mStride;
  for (int32_t y = 0; y < size.height; y++) {
    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();

    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);

    for (int32_t x = 0; x < size.width; x++) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 1;
        // Trigger early jump on coming loop iterations, this will be reset
        // next line anyway.
        inSkipRectY = false;
        continue;
      }
      int32_t topLeft = topLeftBase[x];
      int32_t topRight = topRightBase[x];
      int32_t bottomRight = bottomRightBase[x];
      int32_t bottomLeft = bottomLeftBase[x];

      uint32_t value = bottomRight - topRight - bottomLeft;
      value += topLeft;

      data[stride * y + x] = (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32;
    }
  }
}

/**
 * Compute the box blur size (which we're calling the blur radius) from
 * the standard deviation.
 *
 * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
 * approximating a Gaussian using box blurs.  This yields quite a good
 * approximation for a Gaussian.  Then we multiply this by 1.5 since our
 * code wants the radius of the entire triple-box-blur kernel instead of
 * the diameter of an individual box blur.  For more details, see:
 *   http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
 *   https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
 */
static const Float GAUSSIAN_SCALE_FACTOR = Float((3 * sqrt(2 * M_PI) / 4) * 1.5);

IntSize
AlphaBoxBlur::CalculateBlurRadius(const Point& aStd)
{
    IntSize size(static_cast<int32_t>(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)),
                 static_cast<int32_t>(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f)));

    return size;
}

} // namespace gfx
} // namespace mozilla