/*
 * Copyright (c) 2017-2020 Software Architecture Group, Hasso Plattner Institute
 *
 * Licensed under the MIT License.
 */
package de.hpi.swa.trufflesqueak.nodes.plugins;

import java.util.function.LongBinaryOperator;

import com.oracle.truffle.api.CompilerDirectives;
import com.oracle.truffle.api.CompilerDirectives.CompilationFinal;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;

import de.hpi.swa.trufflesqueak.exceptions.PrimitiveExceptions.PrimitiveFailed;
import de.hpi.swa.trufflesqueak.exceptions.SqueakExceptions.SqueakException;
import de.hpi.swa.trufflesqueak.image.SqueakImageContext;
import de.hpi.swa.trufflesqueak.model.AbstractPointersObject;
import de.hpi.swa.trufflesqueak.model.AbstractSqueakObject;
import de.hpi.swa.trufflesqueak.model.FloatObject;
import de.hpi.swa.trufflesqueak.model.NativeObject;
import de.hpi.swa.trufflesqueak.model.NilObject;
import de.hpi.swa.trufflesqueak.model.PointersObject;
import de.hpi.swa.trufflesqueak.model.VariablePointersObject;
import de.hpi.swa.trufflesqueak.model.layout.ObjectLayouts.FORM;
import de.hpi.swa.trufflesqueak.nodes.SqueakGuards;
import de.hpi.swa.trufflesqueak.util.MiscUtils;
import de.hpi.swa.trufflesqueak.util.UnsafeUtils;
import sun.misc.Unsafe;

/* Automatically generated by
   SmartSyntaxPluginCodeGenerator * VMMaker.oscog-eem.2480 uuid: bb3ffda7-8241-4dea-b886-d656e474b6c1
    from
   BitBltSimulation * VMMaker.oscog-eem.2480 uuid: bb3ffda7-8241-4dea-b886-d656e474b6c1
 */

public final class BitBlt {

    /* Image context handed to the constructor. */
    private final SqueakImageContext image;

    /* Constants */
    /* The low 32 bits set, kept in a long so the value is non-negative. */
    private static final long ALL_ONES = 0xFFFFFFFFL;
    /* NOTE(review): ALPHA/BLUE/GREEN/RED_INDEX are presumably indices into the 4-element cmMaskTable/cmShiftTable arrays — confirm. */
    private static final int ALPHA_INDEX = 3;
    /* NOTE(review): the BB_* values look like slot indices of the BitBlt receiver object — confirm against the loading code. */
    private static final int BB_CLIP_HEIGHT_INDEX = 13;
    private static final int BB_CLIP_WIDTH_INDEX = 12;
    private static final int BB_CLIP_X_INDEX = 10;
    private static final int BB_CLIP_Y_INDEX = 11;
    private static final int BB_COLOR_MAP_INDEX = 14;
    private static final int BB_DEST_FORM_INDEX = 0;
    private static final int BB_DEST_X_INDEX = 4;
    private static final int BB_DEST_Y_INDEX = 5;
    private static final int BB_HALFTONE_FORM_INDEX = 2;
    private static final int BB_HEIGHT_INDEX = 7;
    private static final int BB_RULE_INDEX = 3;
    private static final int BB_SOURCE_FORM_INDEX = 1;
    private static final int BB_SOURCE_X_INDEX = 8;
    private static final int BB_SOURCE_Y_INDEX = 9;
    private static final int BB_WARP_BASE = 15;
    private static final int BB_WIDTH_INDEX = 6;
    // private static final int BE_BITBLT_INDEX = 2;
    /* FIXED_PT1 below equals 1 << BINARY_POINT. */
    private static final int BINARY_POINT = 14;
    private static final int BLUE_INDEX = 2;
    /* COLOR_MAP_* are single-bit flags; alphaSourceBlendBits8 masks COLOR_MAP_NEW_STYLE out of cmFlags. */
    private static final int COLOR_MAP_FIXED_PART = 2;
    private static final int COLOR_MAP_INDEXED_PART = 4;
    private static final int COLOR_MAP_NEW_STYLE = 8;
    private static final int COLOR_MAP_PRESENT = 1;
    private static final int FIXED_PT1 = 0x4000;
    private static final int GREEN_INDEX = 1;
    /* Same value as the literal used to size opTable. */
    private static final int OP_TABLE_SIZE = 43;
    private static final int RED_INDEX = 0;

    /* Return the default translation table from 1..8 bit indexed colors to 32bit */
    /* The table has been generated by the following statements */
    /*
     * | pvs hex | String streamContents:[:s| s nextPutAll:'static long theTable[256] = { '. pvs :=
     * (Color colorMapIfNeededFrom: 8 to: 32) asArray. 1 to: pvs size do:[:i| i > 1 ifTrue:[s
     * nextPutAll:', ']. (i-1 \\ 8) = 0 ifTrue:[s cr]. s nextPutAll:'0x'. hex := (pvs at: i)
     * printStringBase: 16. s nextPutAll: (hex copyFrom: 4 to: hex size). ]. s nextPutAll:'};'. ].
     */

    /* BitBltSimulation>>#default8To32Table */
    /*
     * 256 entries, one per 8-bit pixel value, each a 32-bit word stored in a long. Entry 0 is 0; every
     * other entry has 0xFF in its top byte. alphaSourceBlendBits8 uses it to expand an 8-bit
     * destination pixel before blending.
     */
    @CompilationFinal(dimensions = 1) private static final long[] DEFAULT_8_TO_32_TABLE = new long[]{
                    0x0L, 0xFF000001L, 0xFFFFFFFFL, 0xFF808080L, 0xFFFF0000L, 0xFF00FF00L, 0xFF0000FFL, 0xFF00FFFFL,
                    0xFFFFFF00L, 0xFFFF00FFL, 0xFF202020L, 0xFF404040L, 0xFF606060L, 0xFF9F9F9FL, 0xFFBFBFBFL, 0xFFDFDFDFL,
                    0xFF080808L, 0xFF101010L, 0xFF181818L, 0xFF282828L, 0xFF303030L, 0xFF383838L, 0xFF484848L, 0xFF505050L,
                    0xFF585858L, 0xFF686868L, 0xFF707070L, 0xFF787878L, 0xFF878787L, 0xFF8F8F8FL, 0xFF979797L, 0xFFA7A7A7L,
                    0xFFAFAFAFL, 0xFFB7B7B7L, 0xFFC7C7C7L, 0xFFCFCFCFL, 0xFFD7D7D7L, 0xFFE7E7E7L, 0xFFEFEFEFL, 0xFFF7F7F7L,
                    0xFF000001L, 0xFF003300L, 0xFF006600L, 0xFF009900L, 0xFF00CC00L, 0xFF00FF00L, 0xFF000033L, 0xFF003333L,
                    0xFF006633L, 0xFF009933L, 0xFF00CC33L, 0xFF00FF33L, 0xFF000066L, 0xFF003366L, 0xFF006666L, 0xFF009966L,
                    0xFF00CC66L, 0xFF00FF66L, 0xFF000099L, 0xFF003399L, 0xFF006699L, 0xFF009999L, 0xFF00CC99L, 0xFF00FF99L,
                    0xFF0000CCL, 0xFF0033CCL, 0xFF0066CCL, 0xFF0099CCL, 0xFF00CCCCL, 0xFF00FFCCL, 0xFF0000FFL, 0xFF0033FFL,
                    0xFF0066FFL, 0xFF0099FFL, 0xFF00CCFFL, 0xFF00FFFFL, 0xFF330000L, 0xFF333300L, 0xFF336600L, 0xFF339900L,
                    0xFF33CC00L, 0xFF33FF00L, 0xFF330033L, 0xFF333333L, 0xFF336633L, 0xFF339933L, 0xFF33CC33L, 0xFF33FF33L,
                    0xFF330066L, 0xFF333366L, 0xFF336666L, 0xFF339966L, 0xFF33CC66L, 0xFF33FF66L, 0xFF330099L, 0xFF333399L,
                    0xFF336699L, 0xFF339999L, 0xFF33CC99L, 0xFF33FF99L, 0xFF3300CCL, 0xFF3333CCL, 0xFF3366CCL, 0xFF3399CCL,
                    0xFF33CCCCL, 0xFF33FFCCL, 0xFF3300FFL, 0xFF3333FFL, 0xFF3366FFL, 0xFF3399FFL, 0xFF33CCFFL, 0xFF33FFFFL,
                    0xFF660000L, 0xFF663300L, 0xFF666600L, 0xFF669900L, 0xFF66CC00L, 0xFF66FF00L, 0xFF660033L, 0xFF663333L,
                    0xFF666633L, 0xFF669933L, 0xFF66CC33L, 0xFF66FF33L, 0xFF660066L, 0xFF663366L, 0xFF666666L, 0xFF669966L,
                    0xFF66CC66L, 0xFF66FF66L, 0xFF660099L, 0xFF663399L, 0xFF666699L, 0xFF669999L, 0xFF66CC99L, 0xFF66FF99L,
                    0xFF6600CCL, 0xFF6633CCL, 0xFF6666CCL, 0xFF6699CCL, 0xFF66CCCCL, 0xFF66FFCCL, 0xFF6600FFL, 0xFF6633FFL,
                    0xFF6666FFL, 0xFF6699FFL, 0xFF66CCFFL, 0xFF66FFFFL, 0xFF990000L, 0xFF993300L, 0xFF996600L, 0xFF999900L,
                    0xFF99CC00L, 0xFF99FF00L, 0xFF990033L, 0xFF993333L, 0xFF996633L, 0xFF999933L, 0xFF99CC33L, 0xFF99FF33L,
                    0xFF990066L, 0xFF993366L, 0xFF996666L, 0xFF999966L, 0xFF99CC66L, 0xFF99FF66L, 0xFF990099L, 0xFF993399L,
                    0xFF996699L, 0xFF999999L, 0xFF99CC99L, 0xFF99FF99L, 0xFF9900CCL, 0xFF9933CCL, 0xFF9966CCL, 0xFF9999CCL,
                    0xFF99CCCCL, 0xFF99FFCCL, 0xFF9900FFL, 0xFF9933FFL, 0xFF9966FFL, 0xFF9999FFL, 0xFF99CCFFL, 0xFF99FFFFL,
                    0xFFCC0000L, 0xFFCC3300L, 0xFFCC6600L, 0xFFCC9900L, 0xFFCCCC00L, 0xFFCCFF00L, 0xFFCC0033L, 0xFFCC3333L,
                    0xFFCC6633L, 0xFFCC9933L, 0xFFCCCC33L, 0xFFCCFF33L, 0xFFCC0066L, 0xFFCC3366L, 0xFFCC6666L, 0xFFCC9966L,
                    0xFFCCCC66L, 0xFFCCFF66L, 0xFFCC0099L, 0xFFCC3399L, 0xFFCC6699L, 0xFFCC9999L, 0xFFCCCC99L, 0xFFCCFF99L,
                    0xFFCC00CCL, 0xFFCC33CCL, 0xFFCC66CCL, 0xFFCC99CCL, 0xFFCCCCCCL, 0xFFCCFFCCL, 0xFFCC00FFL, 0xFFCC33FFL,
                    0xFFCC66FFL, 0xFFCC99FFL, 0xFFCCCCFFL, 0xFFCCFFFFL, 0xFFFF0000L, 0xFFFF3300L, 0xFFFF6600L, 0xFFFF9900L,
                    0xFFFFCC00L, 0xFFFFFF00L, 0xFFFF0033L, 0xFFFF3333L, 0xFFFF6633L, 0xFFFF9933L, 0xFFFFCC33L, 0xFFFFFF33L,
                    0xFFFF0066L, 0xFFFF3366L, 0xFFFF6666L, 0xFFFF9966L, 0xFFFFCC66L, 0xFFFFFF66L, 0xFFFF0099L, 0xFFFF3399L,
                    0xFFFF6699L, 0xFFFF9999L, 0xFFFFCC99L, 0xFFFFFF99L, 0xFFFF00CCL, 0xFFFF33CCL, 0xFFFF66CCL, 0xFFFF99CCL,
                    0xFFFFCCCCL, 0xFFFFFFCCL, 0xFFFF00FFL, 0xFFFF33FFL, 0xFFFF66FFL, 0xFFFF99FFL, 0xFFFFCCFFL, 0xFFFFFFFFL};

    /* Variables */
    /* Affected rectangle; copyBits zeroes all four values when the clipped extent is empty. */
    private int affectedB;
    private int affectedL;
    private int affectedR;
    private int affectedT;
    /* Clipped height and width of the transfer, computed by clipRange. */
    private int bbH;
    private int bbW;
    private PointersObject bitBltOop;
    private long bitCount;
    /* Clipping rectangle consulted by clipRange. */
    private int clipHeight;
    private int clipWidth;
    private int clipX;
    private int clipY;
    private long cmBitsPerColor;
    /* Bit set of COLOR_MAP_* flags. */
    private long cmFlags;
    private int[] cmLookupTable;
    private long cmMask;

    /** Used in {@link BitBlt#setupColorMasksFromto}. */
    private final int[] cmMaskTableTemplate = new int[]{0, 0, 0, 0};
    private final int[] cmShiftTableTemplate = new int[]{0, 0, 0, 0};

    private int[] cmMaskTable;
    private int[] cmShiftTable;
    private int combinationRule;
    private long componentAlphaModeAlpha;
    private long componentAlphaModeColor;
    private Object destBits;
    private int destBitsBaseOffset;
    private int destBitsIndexScale;
    private long destDelta;
    /* Bits per destination pixel; alphaBlendConstwithpaintMode is a no-op below 16. */
    private int destDepth;
    private PointersObject destForm;
    private int destHeight;
    private long destIndex;
    private long destMask;
    private boolean destMSB;
    private int destPitch;
    /* Destination pixels per word; a value of 1 selects the 32bpp path in alphaBlendConstwithpaintMode. */
    private int destPPW;
    private int destWidth;
    /* Requested (unclipped) destination origin; clipRange derives dx/dy from it. */
    private int destX;
    private int destY;
    @CompilationFinal(dimensions = 1) private static final int[] DITHER_8_LOOKUP = new int[4096];
    /* 4x4 threshold matrix in row-major order; alphaSourceBlendBits16 indexes it with (dstY & 3) * 4 + column. */
    @CompilationFinal(dimensions = 1) private static final int[] DITHER_MATRIX_4X4 = new int[]{
                    0, 8, 2, 10,
                    12, 4, 14, 6,
                    3, 11, 1, 9,
                    15, 7, 13, 5
    };
    @CompilationFinal(dimensions = 1) private static final int[] DITHER_THRESHOLDS_16 = new int[]{0, 2, 4, 6, 8, 12, 14, 16};
    @CompilationFinal(dimensions = 1) private static final int[] DITHER_VALUES_16 = new int[]{
                    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    };
    private long dstBitShift;
    /* Clipped destination origin, set by clipRange and adjusted by checkSourceOverlap. */
    private int dx;
    private int dy;
    private long endOfDestination;
    private long endOfSource;
    private long[] gammaLookupTable;
    private AbstractSqueakObject halftoneForm;
    private int[] halftoneBits;
    private long halftoneHeight;
    /* Horizontal direction; checkSourceOverlap sets it to -1 when copying right to left. */
    private long hDir;
    private int height;
    @SuppressWarnings("unused") private boolean isWarping;
    private long mask1;
    private long mask2;
    /*
     * Indexed by pixel depth (see alphaBlendConstwithpaintMode). Indices 1, 2, 4, 8 and 16 hold the
     * mask with that many low bits set, index 5 holds 31, index 32 holds -1 (all int bits set) and
     * every other index holds 0.
     */
    @CompilationFinal(dimensions = 1) private static final int[] MASK_TABLE = new int[]{
                    0, 1, 3, 0, 15, 31, 0, 0, 255, 0, 0, 0, 0, 0, 0, 0, 65535,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1
    };
    private static final String MODULE_NAME = "BitBltPlugin * VMMaker.oscog-eem.2480 (TruffleSqueak)";
    private boolean noHalftone;
    /* When true, clipRange skips clipping against the source extent. */
    private boolean noSource;
    private int nWords;
    /* NOTE(review): the literal 43 duplicates OP_TABLE_SIZE; consider using the constant here. */
    @CompilationFinal(dimensions = 1) private final LongBinaryOperator[] opTable = new LongBinaryOperator[43];
    private boolean preload;
    private long skew;
    /* Constant alpha (0..255) applied by alphaBlendConstwithpaintMode. */
    private long sourceAlpha;
    private Object sourceBits;
    private int sourceBitsBaseOffset;
    private int sourceBitsIndexScale;
    private long sourceDelta;
    private int sourceDepth;
    private PointersObject sourceForm;
    private int sourceHeight;
    private long sourceIndex;
    private boolean sourceMSB;
    private int sourcePitch;
    private int sourcePPW;
    private int sourceWidth;
    /* Requested (unclipped) source origin; clipRange derives sx/sy from it. */
    private int sourceX;
    private int sourceY;
    private long srcBitShift;
    /* Clipped source origin, set by clipRange and adjusted by checkSourceOverlap. */
    private int sx;
    private int sy;
    private long[] ungammaLookupTable;
    /* Vertical direction; checkSourceOverlap sets it to -1 when copying bottom to top. */
    private long vDir;
    private long warpAlignMask;
    private long warpAlignShift;
    private int[] warpBitShiftTable = new int[32];
    private long warpSrcMask;
    private long warpSrcShift;
    private int width;

    private boolean successFlag = false;

    /*
     * Creates a BitBlt bound to the given image context and then runs initialiseModule(). The field
     * is assigned before that call; NOTE(review): whether initialiseModule() reads it is not visible
     * here, so keep this order.
     */
    public BitBlt(final SqueakImageContext image) {
        this.image = image;
        initialiseModule();
    }

    /* BitBltSimulation>>#addWord:with: */
    /* Combination rule: plain 64-bit sum of both words (wraps on overflow like any long addition). */
    private static long addWordwith(final long sourceWord, final long destinationWord) {
        final long sum = destinationWord + sourceWord;
        return sum;
    }

    /* BitBltSimulation>>#alphaBlendConst:with: */
    /* Constant-alpha blend with paint mode switched off; delegates to alphaBlendConstwithpaintMode. */
    private long alphaBlendConstwith(final long sourceWord, final long destinationWord) {
        final boolean paintMode = false;
        return alphaBlendConstwithpaintMode(sourceWord, destinationWord, paintMode);
    }

    /*
     * Blend sourceWord with destinationWord using a constant alpha. Alpha is encoded as 0 meaning
     * 0.0, and 255 meaning 1.0. The blend produced is alpha*source + (1.0-alpha)*dest, with the
     * computation being performed independently on each color component. This function could
     * eventually blend into any depth destination, using the same color averaging and mapping as
     * warpBlt. paintMode = true means do nothing if the source pixel value is zero.
     */
    /*
     * This first implementation works with dest depths of 16 and 32 bits only. Normal color mapping
     * will allow sources of lower depths in this case, and results can be mapped directly by
     * truncation, so no extra color maps are needed. To allow storing into any depth will require
     * subsequent addition of two other colormaps, as is the case with WarpBlt.
     */

    /* BitBltSimulation>>#alphaBlendConst:with:paintMode: */
    /*
     * Blends sourceWord over destinationWord with the constant sourceAlpha (0..255). Destinations
     * shallower than 16 bits are returned unchanged. With one pixel per word all four 8-bit channels
     * are blended; otherwise each pixel of the word is blended as three 5-bit channels. In paint
     * mode a zero source pixel leaves the destination pixel as it is.
     */
    private long alphaBlendConstwithpaintMode(final long sourceWord, final long destinationWord, final boolean paintMode) {
        if (destDepth < 16) {
            return destinationWord;
        }
        final long inverseAlpha = 0xFF - sourceAlpha;
        if (destPPW == 1) {
            /* 32bpp: alpha is blended along with the colour channels. */
            if (paintMode && sourceWord == 0) {
                /* Painting a transparent pixel changes nothing. */
                return destinationWord;
            }
            final long pairMask = 0xFF00FF;
            /* Red and blue share one accumulator, alpha and green the other. */
            long redBlue = (sourceWord & pairMask) * sourceAlpha + (destinationWord & pairMask) * inverseAlpha + pairMask;
            long alphaGreen = (sourceWord >>> 8 & pairMask) * sourceAlpha + (destinationWord >>> 8 & pairMask) * inverseAlpha + pairMask;
            /* Divide both packed channels by 255. */
            redBlue = redBlue + (redBlue - 0x10001 >>> 8 & pairMask) >>> 8 & pairMask;
            alphaGreen = alphaGreen + (alphaGreen - 0x10001 >>> 8 & pairMask) >>> 8 & pairMask;
            return redBlue | alphaGreen << 8;
        }

        final long pixMask = MASK_TABLE[destDepth];
        final long bitsPerColor = 5;
        final long rgbMask = 0x1F;
        long blended = destinationWord;
        long remainingMask = destMask;
        long remainingDest = destinationWord;
        long remainingSource = sourceWord;
        for (int pixel = 0; pixel < destPPW; pixel++) {
            final long sourcePixVal = remainingSource & pixMask;
            /* Only touch pixels inside the destination mask, and skip zero source pixels in paint mode. */
            final boolean insideMask = (remainingMask & pixMask) != 0;
            if (insideMask && !(paintMode && sourcePixVal == 0)) {
                final long destPixVal = remainingDest & pixMask;
                long pixBlend = 0;
                for (int channel = 0; channel < 3; channel++) {
                    final long shift = channel * bitsPerColor;
                    final long weighted = (shr(sourcePixVal, shift) & rgbMask) * sourceAlpha + (shr(destPixVal, shift) & rgbMask) * inverseAlpha + 0xFE;
                    final long blend = div(weighted, 0xFF) & rgbMask;
                    pixBlend = pixBlend | shl(blend, shift);
                }
                blended = blended & ~shl(pixMask, pixel * 16) | shl(pixBlend, pixel * 16);
            }
            remainingMask = shr(remainingMask, destDepth);
            remainingSource = shr(remainingSource, destDepth);
            remainingDest = shr(remainingDest, destDepth);
        }
        return blended;
    }

    /*
     * Blend sourceWord with destinationWord using the alpha value from sourceWord. Alpha is encoded
     * as 0 meaning 0.0, and 255 meaning 1.0. In contrast to alphaBlend:with: the color produced is
     *
     * srcColor + (1-srcAlpha) * dstColor
     *
     * e.g., it is assumed that the source color is already scaled.
     */

    /* BitBltSimulation>>#alphaBlendScaled:with: */
    /*
     * Blends a 32-bit ARGB source pixel whose colour channels are already scaled by its own alpha
     * onto destinationWord. Per channel the result is source + ((dest * (255 - srcAlpha)) >>> 8),
     * saturated at 0xFF. Both arguments are expected to be unsigned 32-bit values held in a long;
     * for such inputs the result also stays within the low 32 bits.
     */
    private long alphaBlendScaledwith(final long sourceWord, final long destinationWord) {
        /* High 8 bits of source pixel is source opacity (ARGB format) */
        final long unAlpha = 0xFF - (sourceWord >>> 24);
        /* blend red and blue components */
        long rb = ((destinationWord & 0xFF00FF) * unAlpha >>> 8 & 0xFF00FF) + (sourceWord & 0xFF00FF);
        /* blend alpha and green components */
        long ag = ((destinationWord >>> 8 & 0xFF00FF) * unAlpha >>> 8 & 0xFF00FF) + (sourceWord >>> 8 & 0xFF00FF);
        /* saturate red and blue components if there is a carry (bits 8 and 24 hold the carries) */
        rb = rb & 0xFF00FF | (rb & 0x1000100) * 0xFF >>> 8;
        /*
         * Saturate alpha and green components if there is a carry. The shifted value must stay a
         * long: narrowing it to int would sign-extend into the upper 32 bits whenever the resulting
         * alpha is 0x80 or more.
         */
        ag = (ag & 0xFF00FF) << 8 | (ag & 0x1000100) * 0xFF;
        return ag | rb;
    }

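    /*
     * Blend sourceWord with destinationWord, assuming both are 32-bit pixels. The source is assumed
     * to have 255*alpha in the high 8 bits of each pixel. A source alpha of 0 returns the
     * destinationWord unchanged and a source alpha of 255 returns the sourceWord unchanged.
     * Otherwise the blend produced is alpha*source + (1-alpha)*dest, with the computation being
     * performed independently on each color component. The alpha channel is blended as well: the
     * source contributes a full alpha of 255 and the destination its own high 8 bits, so the high
     * byte of the result is generally not 0.
     */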

    /* BitBltSimulation>>#alphaBlend:with: */
    /*
     * Blends sourceWord over destinationWord using the alpha in the top byte of sourceWord. Alpha 0
     * yields the destination and alpha 255 the source; anything in between mixes every channel,
     * with the source's alpha channel treated as fully opaque (0xFF) in the mix.
     */
    private long alphaBlendwith(final long sourceWord, final long destinationWord) {
        final long srcWeight = sourceWord >>> 24;
        if (srcWeight == 0) {
            return destinationWord;
        } else if (srcWeight == 0xFF) {
            return sourceWord;
        }
        final long dstWeight = 0xFF - srcWeight;
        final long pairMask = 0xFF00FF;

        /* Two channels are processed per accumulator: red/blue and alpha/green. */
        final long srcRedBlue = sourceWord & pairMask;
        final long dstRedBlue = destinationWord & pairMask;
        final long srcAlphaGreen = (sourceWord >>> 8 | 0xFF0000) & pairMask;
        final long dstAlphaGreen = destinationWord >>> 8 & pairMask;

        final long sumRedBlue = srcRedBlue * srcWeight + dstRedBlue * dstWeight + pairMask;
        final long sumAlphaGreen = srcAlphaGreen * srcWeight + dstAlphaGreen * dstWeight + pairMask;

        /* Divide each packed channel by 255. */
        final long redBlue = sumRedBlue + (sumRedBlue - 0x10001 >>> 8 & pairMask) >>> 8 & pairMask;
        final long alphaGreen = sumAlphaGreen + (sumAlphaGreen - 0x10001 >>> 8 & pairMask) >>> 8 & pairMask;
        return redBlue | alphaGreen << 8;
    }

    /* BitBltSimulation>>#alphaPaintConst:with: */
    /* Constant-alpha blend in paint mode: an all-zero source word leaves the destination untouched. */
    private long alphaPaintConstwith(final long sourceWord, final long destinationWord) {
        return sourceWord == 0 ? destinationWord : alphaBlendConstwithpaintMode(sourceWord, destinationWord, true);
    }

    /*
     * This version assumes combinationRule = 34 sourcePixSize = 32 destPixSize = 16 sourceForm ~=
     * destForm.
     */

    /* BitBltSimulation>>#alphaSourceBlendBits16 */
    /*
     * Blends 32-bit source pixels into a 16-bit destination, two destination pixels per 32-bit
     * word. Fully opaque pixels are dithered and stored, fully transparent ones are skipped, and the
     * rest are blended with the expanded destination pixel before dithering. A dithered value of 0
     * is stored as 1.
     */
    private void alphaSourceBlendBits16() {
        int halfShift = (dx & 1) * 16;
        if (destMSB) {
            halfShift = 16 - halfShift;
        }
        /* mask1 selects the half word that must be preserved for the first pixel of each row. */
        mask1 = shl(0xFFFF, 16 - halfShift);

        int srcRow = sy;
        int dstRow = dy;
        for (int rowsLeft = bbH; rowsLeft > 0; rowsLeft--, srcRow++, dstRow++) {
            long srcIndex = srcRow * sourcePitch + sx * 4;
            long dstIndex = dstRow * destPitch + dx / 2 * 4;
            final int ditherRow = (dstRow & 3) * 4;
            int ditherColumn = sx & 3;
            long keepMask = mask1;
            halfShift = keepMask == 0xFFFF ? 16 : 0;

            for (int pixelsLeft = bbW; pixelsLeft > 0; pixelsLeft--) {
                final int threshold = DITHER_MATRIX_4X4[ditherRow + ditherColumn];
                ditherColumn = ditherColumn + 1 & 3;
                final long sourceWord = srcLongAt(srcIndex);
                final long srcAlpha = sourceWord >>> 24;
                if (srcAlpha != 0) {
                    long pixel;
                    if (srcAlpha == 0xFF) {
                        /* Opaque: dither from 32 to 16 bit. */
                        pixel = dither32To16threshold(sourceWord, threshold);
                    } else {
                        /* 0 < srcAlpha < 255: isolate the destination pixel ... */
                        long destWord = dstLongAt(dstIndex);
                        destWord = destWord & ~keepMask;
                        destWord = destWord >>> halfShift;
                        /* ... expand it from 16 to 32 bit by adding zero bits, then mix and dither. */
                        destWord = (destWord & 0x7C00) << 9 | (destWord & 0x3E0) << 6 | (destWord & 0x1F) << 3 | 0xFF000000L;
                        pixel = dither32To16threshold(alphaBlendScaledwith(sourceWord, destWord), threshold);
                    }
                    if (pixel == 0) {
                        pixel = shl(1, halfShift);
                    } else {
                        pixel = shl(pixel, halfShift);
                    }
                    destLongAtputmask(dstIndex, keepMask, pixel);
                }
                srcIndex += 4;
                /* Move to the next destination word once its second half has been handled. */
                final boolean wordDone = destMSB ? halfShift == 0 : halfShift != 0;
                if (wordDone) {
                    dstIndex += 4;
                }
                /* Toggle between 0 and 16 */
                halfShift ^= 16;
                keepMask = ~keepMask;
            }
        }
    }

    /*
     * This version assumes combinationRule = 34 sourcePixSize = destPixSize = 32 sourceForm ~=
     * destForm. Note: The inner loop has been optimized for dealing with the special cases of
     * srcAlpha = 0.0 and srcAlpha = 1.0
     */

    /* BitBltSimulation>>#alphaSourceBlendBits32 */
    /*
     * Blends 32-bit source pixels into a 32-bit destination. Each row is consumed in runs:
     * consecutive opaque pixels (alpha 255) are copied, consecutive transparent pixels (alpha 0) are
     * skipped, and any other pixel is blended individually via alphaBlendScaledwith.
     */
    private void alphaSourceBlendBits32() {
        int srcRow = sy;
        int dstRow = dy;
        for (int rowsLeft = bbH; rowsLeft > 0; rowsLeft--, srcRow++, dstRow++) {
            long srcIndex = srcRow * sourcePitch + sx * 4;
            long dstIndex = dstRow * destPitch + dx * 4;
            /* Number of pixels of this row that still have to be processed. */
            int remaining = bbW;
            while (remaining > 0) {
                long sourceWord = srcLongAt(srcIndex);
                final long srcAlpha = sourceWord >>> 24;
                if (srcAlpha == 0xFF) {
                    /* Copy as many words as possible with alpha = 255 */
                    do {
                        dstLongAtput(dstIndex, sourceWord);
                        srcIndex += 4;
                        dstIndex += 4;
                        remaining--;
                    } while (remaining > 0 && (sourceWord = srcLongAt(srcIndex)) >>> 24 == 0xFF);
                } else if (srcAlpha == 0) {
                    /* Skip as many words as possible with alpha = 0 */
                    do {
                        srcIndex += 4;
                        dstIndex += 4;
                        remaining--;
                    } while (remaining > 0 && srcLongAt(srcIndex) >>> 24 == 0);
                } else {
                    /* 0 < srcAlpha < 255: mix colors for this single word */
                    dstLongAtput(dstIndex, alphaBlendScaledwith(sourceWord, dstLongAt(dstIndex)));
                    srcIndex += 4;
                    dstIndex += 4;
                    remaining--;
                }
            }
        }
    }

    /*
     * This version assumes combinationRule = 34 sourcePixSize = 32 destPixSize = 8 sourceForm ~=
     * destForm. Note: This is not real blending since we don't have the source colors available.
     */

    /* BitBltSimulation>>#alphaSourceBlendBits8 */
    /*
     * Blends 32-bit source pixels into an 8-bit destination, four destination pixels per 32-bit
     * word. Source alpha of 31 or less is treated as transparent and 224 or more as opaque; values
     * in between are blended with the destination pixel expanded through DEFAULT_8_TO_32_TABLE. A
     * 0x1F1F1F1F offset that toggles per pixel and per row is added to the source before it is
     * mapped back to 8 bits.
     */
    private void alphaSourceBlendBits8() {
        final long toggle = 0x1F1F1F1F;
        final long[] expandTable = DEFAULT_8_TO_32_TABLE;
        final long flagsWithoutNewStyle = cmFlags & ~COLOR_MAP_NEW_STYLE;

        /* mask1 is the bit offset of the first pixel in its word, mask2 the mask preserving the other three. */
        mask1 = (dx & 3) * 8;
        if (destMSB) {
            mask1 = 24 - mask1;
        }
        mask2 = ALL_ONES ^ shl(0xFF, mask1);

        long adjust = (dx & 1) == 0 ? 0 : toggle;
        if ((dy & 1) == 0) {
            adjust ^= toggle;
        }

        int srcRow = sy;
        int dstRow = dy;
        for (int rowsLeft = bbH; rowsLeft > 0; rowsLeft--, srcRow++, dstRow++) {
            adjust ^= toggle;
            long srcIndex = srcRow * sourcePitch + sx * 4;
            long dstIndex = dstRow * destPitch + dx / 4 * 4;
            long pixelShift = mask1;
            long keepMask = mask2;
            for (int pixelsLeft = bbW; pixelsLeft > 0; pixelsLeft--) {
                long sourceWord = (srcLongAt(srcIndex) & ~adjust) + adjust;
                final long srcAlpha = sourceWord >>> 24;
                if (srcAlpha > 0x1F) {
                    if (srcAlpha < 224) {
                        /* Partially transparent: blend with the expanded destination pixel. */
                        final long destPixel = shr(dstLongAt(dstIndex) & ~keepMask, pixelShift);
                        sourceWord = alphaBlendScaledwith(sourceWord, expandTable[(int) destPixel]);
                    }
                    /* Map back to 8 bits and store into the current byte lane. */
                    sourceWord = mapPixelflags(sourceWord, flagsWithoutNewStyle);
                    sourceWord = shl(sourceWord, pixelShift);
                    destLongAtputmask(dstIndex, keepMask, sourceWord);
                }
                srcIndex += 4;
                if (destMSB) {
                    if (pixelShift == 0) {
                        dstIndex += 4;
                        pixelShift = 24;
                        keepMask = 0xFFFFFF;
                    } else {
                        pixelShift -= 8;
                        keepMask = keepMask >>> 8 | 0xFF000000L;
                    }
                } else if (pixelShift == 24) {
                    dstIndex += 4;
                    pixelShift = 0;
                    keepMask = 0xFFFFFF00L;
                } else {
                    pixelShift += 8;
                    keepMask = keepMask << 8 | 0xFF;
                }
                adjust ^= toggle;
            }
        }
    }

    /* BitBltSimulation>>#bitAndInvert:with: */
    /* Combination rule: source AND (NOT destination). */
    private static long bitAndInvertwith(final long sourceWord, final long destinationWord) {
        final long invertedDestination = ~destinationWord;
        return invertedDestination & sourceWord;
    }

    /* BitBltSimulation>>#bitAnd:with: */
    /* Combination rule: bitwise AND of source and destination. */
    private static long bitAndwith(final long sourceWord, final long destinationWord) {
        final long common = destinationWord & sourceWord;
        return common;
    }

    /* BitBltSimulation>>#bitInvertAndInvert:with: */
    /* Combination rule: (NOT source) AND (NOT destination), written as NOT (source OR destination). */
    private static long bitInvertAndInvertwith(final long sourceWord, final long destinationWord) {
        return ~(sourceWord | destinationWord);
    }

    /* BitBltSimulation>>#bitInvertAnd:with: */
    /* Combination rule: (NOT source) AND destination. */
    private static long bitInvertAndwith(final long sourceWord, final long destinationWord) {
        final long invertedSource = ~sourceWord;
        return destinationWord & invertedSource;
    }

    /* BitBltSimulation>>#bitInvertDestination:with: */
    /* Combination rule: NOT destination; the source word is ignored. */
    private static long bitInvertDestinationwith(@SuppressWarnings("unused") final long sourceWord, final long destinationWord) {
        final long inverted = ~destinationWord;
        return inverted;
    }

    /* BitBltSimulation>>#bitInvertOrInvert:with: */
    /* Combination rule: (NOT source) OR (NOT destination), written as NOT (source AND destination). */
    private static long bitInvertOrInvertwith(final long sourceWord, final long destinationWord) {
        return ~(sourceWord & destinationWord);
    }

    /* BitBltSimulation>>#bitInvertOr:with: */
    /* Combination rule: (NOT source) OR destination. */
    private static long bitInvertOrwith(final long sourceWord, final long destinationWord) {
        final long invertedSource = ~sourceWord;
        return destinationWord | invertedSource;
    }

    /* BitBltSimulation>>#bitInvertSource:with: */
    /* Combination rule: NOT source; the destination word is ignored. */
    private static long bitInvertSourcewith(final long sourceWord, @SuppressWarnings("unused") final long destinationWord) {
        final long inverted = ~sourceWord;
        return inverted;
    }

    /* BitBltSimulation>>#bitInvertXor:with: */
    /* Combination rule: (NOT source) XOR destination, written as NOT (source XOR destination). */
    private static long bitInvertXorwith(final long sourceWord, final long destinationWord) {
        return ~(sourceWord ^ destinationWord);
    }

    /* BitBltSimulation>>#bitOrInvert:with: */
    /* Combination rule: source OR (NOT destination). */
    private static long bitOrInvertwith(final long sourceWord, final long destinationWord) {
        final long invertedDestination = ~destinationWord;
        return invertedDestination | sourceWord;
    }

    /* BitBltSimulation>>#bitOr:with: */
    /* Combination rule: bitwise OR of source and destination. */
    private static long bitOrwith(final long sourceWord, final long destinationWord) {
        final long union = destinationWord | sourceWord;
        return union;
    }

    /* BitBltSimulation>>#bitXor:with: */
    /* Combination rule: bitwise XOR of source and destination. */
    private static long bitXorwith(final long sourceWord, final long destinationWord) {
        final long difference = destinationWord ^ sourceWord;
        return difference;
    }

    /* check for possible overlap of source and destination */
    /* ar 10/19/1999: This method requires surfaces to be locked. */

    /* BitBltSimulation>>#checkSourceOverlap */
    /*
     * When source and destination are the same form and the destination does not start above the
     * source, flips the copy direction: bottom-to-top if the destination is lower, right-to-left
     * if it is on the same row but further right. destIndex and destDelta are then recomputed from
     * the adjusted origin and directions.
     */
    private void checkSourceOverlap() {
        if (sourceForm != destForm || dy < sy) {
            return;
        }
        if (dy > sy) {
            /* have to start at bottom */
            vDir = -1;
            sy += bbH - 1;
            dy += bbH - 1;
        } else if (dx > sx) {
            /* y's are equal (dy >= sy and not dy > sy), but x's are backward: start at right */
            hDir = -1;
            sx += bbW - 1;
            dx += bbW - 1;
            /* and fix up masks */
            if (nWords > 1) {
                final long firstMask = mask1;
                mask1 = mask2;
                mask2 = firstMask;
            }
        }
        /* Dest inits may be affected by this change */
        destIndex = dy * destPitch + div(dx, destPPW) * 4;
        destDelta = destPitch * vDir - 4 * (nWords * hDir);
    }

    /* BitBltSimulation>>#clearWord:with: */
    /* Combination rule: always yields 0, whatever the source and destination words are. */
    private static long clearWordwith(@SuppressWarnings("unused") final long sourceWord, @SuppressWarnings("unused") final long destinationWord) {
        final long cleared = 0L;
        return cleared;
    }

    /* clip and adjust source origin and extent appropriately */
    /* first in x */

    /* BitBltSimulation>>#clipRange */
    /*
     * Computes the clipped transfer: sx/sy (source origin), dx/dy (destination origin) and bbW/bbH
     * (extent). The request is first clipped against the clip rectangle and then, unless noSource
     * is set, against the source extent. bbW or bbH may end up zero or negative; callers check that.
     */
    private void clipRange() {
        /* Clip in x against the clip rectangle. */
        if (destX < clipX) {
            final int cutLeft = clipX - destX;
            sx = sourceX + cutLeft;
            bbW = width - cutLeft;
            dx = clipX;
        } else {
            sx = sourceX;
            dx = destX;
            bbW = width;
        }
        final int clipRight = clipX + clipWidth;
        if (dx + bbW > clipRight) {
            bbW = clipRight - dx;
        }

        /* Clip in y against the clip rectangle. */
        if (destY < clipY) {
            final int cutTop = clipY - destY;
            sy = sourceY + cutTop;
            bbH = height - cutTop;
            dy = clipY;
        } else {
            sy = sourceY;
            dy = destY;
            bbH = height;
        }
        final int clipBottom = clipY + clipHeight;
        if (dy + bbH > clipBottom) {
            bbH = clipBottom - dy;
        }

        if (noSource) {
            return;
        }

        /* Clip against the source extent, shifting the destination origin by the same amount. */
        if (sx < 0) {
            dx -= sx;
            bbW += sx;
            sx = 0;
        }
        if (sx + bbW > sourceWidth) {
            bbW = sourceWidth - sx;
        }
        if (sy < 0) {
            dy -= sy;
            bbH += sy;
            sy = 0;
        }
        if (sy + bbH > sourceHeight) {
            bbH = sourceHeight - sy;
        }
    }

    /* This function is exported for the Balloon engine */

    /* BitBltSimulation>>#copyBits */
    /* Runs copyBits(long) with -1 as the factor argument. */
    private void copyBits() {
        final int noFactor = -1;
        copyBits(noFactor);
    }

    /*
     * Clips the requested rectangle and, if anything is left, locks the surfaces, performs the
     * copy with the given factor and unlocks the surfaces again. An empty rectangle only resets
     * the affected rectangle to zero.
     */
    private void copyBits(final long factor) {
        clipRange();
        final boolean hasArea = bbW > 0 && bbH > 0;
        if (hasArea) {
            if (!lockSurfaces()) {
                PrimitiveFailed.andTransferToInterpreter();
            }
            copyBitsLockedAndClipped(factor);
            unlockSurfaces();
        } else {
            /* zero width or height; noop */
            affectedB = 0;
            affectedT = 0;
            affectedR = 0;
            affectedL = 0;
        }
    }

    /* Support for the balloon engine. */

    /* BitBltSimulation>>#copyBitsFrom:to:at: */
    /*
     * Copies the horizontal span from startX up to stopX on row yValue (source and destination
     * share the same x origin) and then calls showDisplayBits().
     */
    protected void copyBitsFromtoat(final int startX, final int stopX, final int yValue) {
        final int spanWidth = stopX - startX;
        sourceX = startX;
        destX = startX;
        destY = yValue;
        width = spanWidth;
        copyBits();
        showDisplayBits();
    }

    /*
     * Perform the actual copyBits operation. Assume: Surfaces have been locked and clipping was
     * performed.
     */

    /* BitBltSimulation>>#copyBitsLockedAndClipped */
    /* Runs copyBitsLockedAndClipped(long) with -1 as the factor argument. */
    private void copyBitsLockedAndClipped() {
        final int noFactor = -1;
        copyBitsLockedAndClipped(noFactor);
    }

    /*
     * Performs the copy for surfaces that are already locked and a rectangle that is already
     * clipped, then records the affected rectangle in affectedL/R/T/B.
     *
     * factorOrMinusOne is the source alpha for combination rules 30 and 31 and must then be in
     * 0..255; -1 means "not supplied", which fails the primitive for those two rules. For all
     * other rules the argument is ignored.
     *
     * Rules 22 and 32 only produce a count (bitCount) and leave no visible change, so they report
     * an empty affected rectangle, matching BitBltSimulation>>#copyBitsLockedAndClipped. Every
     * other rule reports the rectangle that was written, taking the copy direction into account.
     */
    private void copyBitsLockedAndClipped(final long factorOrMinusOne) {
        copyBitsRule41Test();
        if (failed()) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        if (tryCopyingBitsQuickly()) {
            return;
        }
        if (combinationRule == 30 || combinationRule == 31) {
            /* Check and fetch source alpha parameter for alpha blend */
            if (factorOrMinusOne == -1) {
                PrimitiveFailed.andTransferToInterpreter();
            }
            sourceAlpha = factorOrMinusOne;
            if (failed() || sourceAlpha < 0 || sourceAlpha > 0xFF) {
                PrimitiveFailed.andTransferToInterpreter();
            }
        }
        /* Choose and perform the actual copy loop. */
        bitCount = 0;
        performCopyLoop();
        if (combinationRule == 22 || combinationRule == 32) {
            /* zero width and height; just return the count */
            affectedL = affectedR = affectedT = affectedB = 0;
        } else {
            if (hDir > 0) {
                affectedL = dx;
                affectedR = dx + bbW;
            } else {
                /* copied right-to-left: dx is the rightmost pixel */
                affectedL = dx - bbW + 1;
                affectedR = dx + 1;
            }
            if (vDir > 0) {
                affectedT = dy;
                affectedB = dy + bbH;
            } else {
                /* copied bottom-up: dy is the bottommost row */
                affectedT = dy - bbH + 1;
                affectedB = dy + 1;
            }
        }
    }

    /*
     * Test possible use of rule 41, rgbComponentAlpha:with: Nothing to return, just set up some
     * variables
     */

    /* BitBltSimulation>>#copyBitsRule41Test */
    /*
     * Does nothing unless combinationRule is 41. For rule 41 it resets the component-alpha state
     * to its defaults and then throws, because fetching the arguments is not implemented yet.
     */
    private void copyBitsRule41Test() {
        if (combinationRule != 41) {
            return;
        }
        /* defaults; the real values would be fetched from the primitive's arguments (see TODO) */
        gammaLookupTable = null;
        ungammaLookupTable = null;
        componentAlphaModeColor = 0xFFFFFF;
        componentAlphaModeAlpha = 0xFF;

        throw SqueakException.create("Not implemented");
        // TODO: uncomment:
        // if ((methodArgumentCount()) >= 2) {
        // componentAlphaModeAlpha = stackIntegerValue((methodArgumentCount()) - 2);
        // if (failed()) {
        // throw new PrimitiveFailed();
        // }
        // componentAlphaModeColor = stackIntegerValue((methodArgumentCount()) - 1);
        // if (failed()) {
        // throw new PrimitiveFailed();
        // }
        // if ((methodArgumentCount()) == 4) {
        // gammaLookupTableOop = stackObjectValue(1);
        // if (isBytes(gammaLookupTableOop)) {
        // gammaLookupTable = firstIndexableField(gammaLookupTableOop);
        // }
        // ungammaLookupTableOop = stackObjectValue(0);
        // if (isBytes(ungammaLookupTableOop)) {
        // ungammaLookupTable = firstIndexableField(ungammaLookupTableOop);
        // }
        // }
        // } else {
        // if ((methodArgumentCount()) == 1) {
        // componentAlphaModeColor = stackIntegerValue(0);
        // if (failed()) {
        // throw new PrimitiveFailed();
        // }
        // } else {
        // throw new PrimitiveFailed();
        // }
        // }
    }

    /* This version of the inner loop assumes noSource = false. */

    /* BitBltSimulation>>#copyLoop */
    /*
     * Prepares the skew masks, the initial halftone word and the per-word byte increment, then
     * dispatches to the store-only loop (combinationRule == 3) or to the general merge loop.
     */
    private void copyLoop() {
        assert !(preload && skew == 0);
        assert -32 <= skew && skew <= 32; // Modified (image uses 31 instead of 32).

        /* unskew is a bitShift and MUST remain signed, while skewMask is unsigned. */
        final long unskew;
        final long skewMask;
        if (skew == 0) {
            unskew = 0;
            skewMask = ALL_ONES;
        } else if (skew < 0) {
            unskew = skew + 32;
            skewMask = shl(ALL_ONES, 0 - skew);
        } else {
            unskew = skew - 32;
            skewMask = shr(ALL_ONES, skew);
        }
        final long notSkewMask = ~skewMask;

        final long halftoneWord;
        if (!noHalftone) {
            halftoneWord = halftoneLongAt(0);
        } else {
            halftoneWord = ALL_ONES;
            halftoneHeight = 0;
        }

        /* Byte delta between consecutive words, signed by the horizontal direction */
        final long hInc = hDir * 4;

        /* Two versions of the vertical loop: plain store for rule 3, merge for everything else. */
        if (combinationRule != 3) {
            copyLoopGeneralCase(halftoneWord, hInc, notSkewMask, skewMask, unskew);
        } else {
            copyLoopCombinationRule3(halftoneWord, hInc, notSkewMask, skewMask, unskew);
        }
    }

    /*
     * Vertical loop of copyLoop for combinationRule == 3: the skewed source word, masked with the
     * halftone word, is stored directly, so no merge function is called. For every row the first
     * word is written under mask1, the middle words are written whole and, when nWords > 1, the
     * last word is written under mask2. sourceIndex and destIndex advance by hInc per word and by
     * sourceDelta / destDelta at the end of each row.
     *
     * initialHalftoneWord is used for every row unless halftoneHeight > 1, in which case the word
     * is re-fetched per row. notSkewMask, skewMask and unskew are the values prepared by copyLoop.
     */
    private void copyLoopCombinationRule3(final long initialHalftoneWord, final long hInc, final long notSkewMask, final long skewMask, final long unskew) {
        long halftoneWord = initialHalftoneWord;
        int y = dy;
        for (int i = 1; i <= bbH; i++) {
            /*
             * here is the vertical loop for combinationRule = 3 copy mode; no need to call merge
             */
            if (halftoneHeight > 1) {
                /* Otherwise, it's always the same */
                halftoneWord = halftoneLongAt(y);
                y += vDir;
            }
            long prevWord;
            if (preload) {
                /* load the 64-bit shifter */
                prevWord = srcLongAt(sourceIndex);
                sourceIndex += hInc;
            } else {
                prevWord = 0;
            }
            /* first word of the row: only the bits selected by mask1 are replaced */
            destMask = mask1;
            /* pick up next word */
            long thisWord = srcLongAt(sourceIndex);
            sourceIndex += hInc;
            /* 32-bit rotate */
            long skewWord = shift(prevWord & notSkewMask, unskew) | shift(thisWord & skewMask, skew);
            prevWord = thisWord;
            long destWord = dstLongAt(destIndex);
            destWord = destMask & skewWord & halftoneWord | destWord & ~destMask;
            dstLongAtput(destIndex, destWord);
            destIndex += hInc;
            /* middle words (2 .. nWords - 1) are written without masking */
            destMask = ALL_ONES;
            if (skew == 0 && halftoneWord == ALL_ONES) {
                /* Very special inner loop for STORE mode with no skew -- just move words */
                if (preload && hDir == 1) {
                    for (long word = 2; word < nWords; word++) {
                        /* Note loop starts with prevWord loaded (due to preload) */
                        dstLongAtput(destIndex, prevWord);
                        destIndex += hInc;
                        prevWord = srcLongAt(sourceIndex);
                        sourceIndex += hInc;
                    }
                } else {
                    for (long word = 2; word < nWords; word++) {
                        thisWord = srcLongAt(sourceIndex);
                        sourceIndex += hInc;
                        dstLongAtput(destIndex, thisWord);
                        destIndex += hInc;
                    }
                    /* keep prevWord current for the last-word rotate below */
                    prevWord = thisWord;
                }
            } else {
                for (long word = 2; word < nWords; word++) {
                    thisWord = srcLongAt(sourceIndex);
                    sourceIndex += hInc;
                    /* 32-bit rotate */
                    skewWord = shift(prevWord & notSkewMask, unskew) | shift(thisWord & skewMask, skew);
                    prevWord = thisWord;
                    dstLongAtput(destIndex, skewWord & halftoneWord);
                    destIndex += hInc;
                }
            }
            if (nWords > 1) {
                /* last word of the row: only the bits selected by mask2 are replaced */
                destMask = mask2;
                if (((skew < 0 ? skewMask >> -skew : skewMask << skew) & mask2) == 0) {
                    /* we don't need more bits, they will all come from prevWord */
                    thisWord = 0;
                } else {
                    thisWord = srcLongAt(sourceIndex);
                }
                /* sourceIndex advances even when the source word was not read */
                sourceIndex += hInc;
                /* 32-bit rotate */
                skewWord = shift(prevWord & notSkewMask, unskew) | shift(thisWord & skewMask, skew);
                destWord = dstLongAt(destIndex);
                destWord = destMask & skewWord & halftoneWord | destWord & ~destMask;
                dstLongAtput(destIndex, destWord);
                destIndex += hInc;
            }
            /* step to the first word of the next row */
            sourceIndex += sourceDelta;
            destIndex += destDelta;
        }
    }

    /*
     * Vertical loop of copyLoop for every combinationRule other than 3: each skewed source word is
     * masked with the halftone word and combined with the destination word by the merge function
     * taken from opTable[combinationRule + 1]. For every row the first word is written under
     * mask1, the middle words are written whole and, when nWords > 1, the last word is written
     * under mask2. sourceIndex and destIndex advance by hInc per word and by sourceDelta /
     * destDelta at the end of each row.
     *
     * initialHalftoneWord is used for every row unless halftoneHeight > 1, in which case the word
     * is re-fetched per row. notSkewMask, skewMask and unskew are the values prepared by copyLoop.
     */
    private void copyLoopGeneralCase(final long initialHalftoneWord, final long hInc, final long notSkewMask, final long skewMask, final long unskew) {
        long halftoneWord = initialHalftoneWord;
        int y = dy;
        final LongBinaryOperator mergeFnwith = opTable[combinationRule + 1];
        assert mergeFnwith != null : "Unexpected `null` value";

        for (int i = 1; i <= bbH; i++) {
            /* here is the vertical loop for the general case (combinationRule ~= 3) */
            if (halftoneHeight > 1) {
                /* Otherwise, it's always the same */
                halftoneWord = halftoneLongAt(y);
                y += vDir;
            }
            long prevWord;
            if (preload) {
                /* load the 64-bit shifter */
                prevWord = srcLongAt(sourceIndex);
                sourceIndex += hInc;
            } else {
                prevWord = 0;
            }
            /* first word of the row: only the bits selected by mask1 are replaced */
            destMask = mask1;
            /* pick up next word */
            long thisWord = srcLongAt(sourceIndex);
            sourceIndex += hInc;
            /* 32-bit rotate */
            long skewWord = shift(prevWord & notSkewMask, unskew) | shift(thisWord & skewMask, skew);
            prevWord = thisWord;
            long destWord = dstLongAt(destIndex);
            long mergeWord = mergeFnwith.applyAsLong(skewWord & halftoneWord, destWord);
            destWord = destMask & mergeWord | destWord & ~destMask;
            dstLongAtput(destIndex, destWord);
            destIndex += hInc;
            /* middle words (2 .. nWords - 1) are written without masking */
            destMask = ALL_ONES;
            for (long word = 2; word < nWords; word++) {
                /* Normal inner loop does merge: */
                /* pick up next word */
                thisWord = srcLongAt(sourceIndex);
                sourceIndex += hInc;
                /* 32-bit rotate, written out inline: negative amounts shift right, others left */
                skewWord = (unskew < 0 ? (prevWord & notSkewMask) >>> -unskew : (prevWord & notSkewMask) << unskew) |
                                (skew < 0 ? (thisWord & skewMask) >>> -skew : (thisWord & skewMask) << skew);
                prevWord = thisWord;
                mergeWord = mergeFnwith.applyAsLong(skewWord & halftoneWord, dstLongAt(destIndex));
                dstLongAtput(destIndex, mergeWord);
                destIndex += hInc;
            }
            if (nWords > 1) {
                /* last word of the row: only the bits selected by mask2 are replaced */
                destMask = mask2;
                if (((skew < 0 ? skewMask >> -skew : skewMask << skew) & mask2) == 0) {
                    /* we don't need more bits, they will all come from prevWord */
                    thisWord = 0;
                } else {
                    thisWord = srcLongAt(sourceIndex);
                }
                /* sourceIndex advances even when the source word was not read */
                sourceIndex += hInc;
                /* 32-bit rotate */
                skewWord = (unskew < 0 ? (prevWord & notSkewMask) >>> -unskew : (prevWord & notSkewMask) << unskew) |
                                (skew < 0 ? (thisWord & skewMask) >>> -skew : (thisWord & skewMask) << skew);
                destWord = dstLongAt(destIndex);
                mergeWord = mergeFnwith.applyAsLong(skewWord & halftoneWord, destWord);
                destWord = destMask & mergeWord | destWord & ~destMask;
                dstLongAtput(destIndex, destWord);
                destIndex += hInc;
            }
            /* step to the first word of the next row */
            sourceIndex += sourceDelta;
            destIndex += destDelta;
        }
    }

    /*
     * Faster copyLoop when source not used. hDir and vDir are both positive, and preload and skew
     * are unused
     */

    /* BitBltSimulation>>#copyLoopNoSource */
    /*
     * Copy loop for the case where no source form is read: the halftone word (or all ones when
     * there is no halftone) takes the place of the source word. Words are always visited left to
     * right, four bytes at a time; the first word of a row is written under mask1, the middle
     * words whole and, when nWords > 1, the last word under mask2.
     */
    private void copyLoopNoSource() {
        final LongBinaryOperator mergeFnwith = opTable[combinationRule + 1];
        assert mergeFnwith != null : "Unexpected `null` value";
        /* without a halftone the same all-ones word is merged on every row */
        long halftoneWord = noHalftone ? ALL_ONES : 0;
        for (int row = 1; row <= bbH; row++) {
            /* here is the vertical loop */
            if (!noHalftone) {
                halftoneWord = halftoneLongAt(dy + row - 1);
            }
            /* first word */
            destMask = mask1;
            final long firstWord = dstLongAt(destIndex);
            final long firstMerged = mergeFnwith.applyAsLong(halftoneWord, firstWord);
            dstLongAtput(destIndex, destMask & firstMerged | firstWord & ~destMask);
            destIndex += 4;
            /* middle words */
            destMask = ALL_ONES;
            if (combinationRule != 3) {
                /* Normal inner loop does merge */
                for (long word = 2; word < nWords; word++) {
                    final long merged = mergeFnwith.applyAsLong(halftoneWord, dstLongAt(destIndex));
                    dstLongAtput(destIndex, merged);
                    destIndex += 4;
                }
            } else {
                /* Special inner loop for STORE: the halftone word is written as is */
                for (long word = 2; word < nWords; word++) {
                    dstLongAtput(destIndex, halftoneWord);
                    destIndex += 4;
                }
            }
            /* last word */
            if (nWords > 1) {
                destMask = mask2;
                final long lastWord = dstLongAt(destIndex);
                final long lastMerged = mergeFnwith.applyAsLong(halftoneWord, lastWord);
                dstLongAtput(destIndex, destMask & lastMerged | lastWord & ~destMask);
                destIndex += 4;
            }
            destIndex += destDelta;
        }
    }

    /*
     * This version of the inner loop maps source pixels to a destination form with different depth.
     * Because it is already unwieldy, the loop is not unrolled as in the other versions. Preload,
     * skew and skewMask are all overlooked, since pickSourcePixels delivers its destination word
     * already properly aligned. Note that pickSourcePixels could be copied in-line at the top of
     * the horizontal loop, and some of its inits moved out of the loop.
     */
    /*
     * ar 12/7/1999: The loop has been rewritten to use only one pickSourcePixels call. The idea is
     * that the call itself could be inlined. If we decide not to inline pickSourcePixels we could
     * optimize the loop instead.
     */

    /* BitBltSimulation>>#copyLoopPixMap */
    /*
     * Copy loop used when source pixels have to be converted on the way to the destination. Each
     * destination word is assembled by pickSourcePixels...(), masked with the halftone word and
     * combined with the existing destination word by the merge function from
     * opTable[combinationRule + 1]. Besides the destination bits, this method (re)initializes
     * sourcePPW, sourceIndex and sourceDelta and updates srcBitShift, dstBitShift, destMask and
     * destIndex while it runs.
     */
    private void copyLoopPixMap() {
        long halftoneWord = 0;
        final LongBinaryOperator mergeFnwith = opTable[combinationRule + 1];
        assert mergeFnwith != null : "Unexpected `null` value";
        /* source pixels per 32-bit word */
        sourcePPW = div(32, sourceDepth);
        final long sourcePixMask = MASK_TABLE[sourceDepth];
        final long destPixMask = MASK_TABLE[destDepth];
        /* the ColorMapNewStyle bit is stripped before the flags are handed to the pixel picker */
        final long mapperFlags = cmFlags & ~COLOR_MAP_NEW_STYLE;
        /* byte offset of the source word containing (sx, sy) */
        sourceIndex = sy * sourcePitch + div(sx, sourcePPW) * 4;
        /* number of pixels from sx to the end of its source word */
        final long scrStartBits = sourcePPW - (sx & sourcePPW - 1);
        final int nSourceIncs;
        if (bbW < scrStartBits) {
            nSourceIncs = 0;
        } else {
            nSourceIncs = div(bbW - scrStartBits, sourcePPW) + 1;
        }
        /* Note following two items were already calculated in destmask setup! */
        sourceDelta = sourcePitch - nSourceIncs * 4;
        long startBits = destPPW - (dx & destPPW - 1);
        final int endBits = (dx + bbW - 1 & destPPW - 1) + 1;
        if (bbW < startBits) {
            /* the whole row fits into the first destination word */
            startBits = bbW;
        }
        /* bit positions of the first source/destination pixel within their words */
        long srcShift = (sx & sourcePPW - 1) * sourceDepth;
        long dstShift = (dx & destPPW - 1) * destDepth;
        int srcShiftInc = sourceDepth;
        int dstShiftInc = destDepth;
        long dstShiftLeft = 0;
        if (sourceMSB) {
            /* MSB-first source: count the shift from the top of the word and step downwards */
            srcShift = 32 - sourceDepth - srcShift;
            srcShiftInc = 0 - srcShiftInc;
        }
        if (destMSB) {
            /* MSB-first destination: same, and later words start at the topmost pixel */
            dstShift = 32 - destDepth - dstShift;
            dstShiftInc = 0 - dstShiftInc;
            dstShiftLeft = 32 - destDepth;
        }
        if (noHalftone) {
            halftoneWord = ALL_ONES;
        }
        for (int i = 1; i <= bbH; i++) {
            /* here is the vertical loop */
            if (!noHalftone) {
                halftoneWord = halftoneLongAt(dy + i - 1);
            }
            /* reset the per-row state that is shared through fields */
            srcBitShift = srcShift;
            dstBitShift = dstShift;
            destMask = mask1;
            /* Here is the horizontal loop... */
            long nPix = startBits;
            long words = nWords;
            do {
                /* align next word to leftmost pixel */
                final long skewWord = pickSourcePixelsflagssrcMaskdestMasksrcShiftIncdstShiftInc(nPix, mapperFlags, sourcePixMask, destPixMask, srcShiftInc, dstShiftInc);
                /* every word after the first starts at the word's leftmost pixel */
                dstBitShift = dstShiftLeft;
                if (destMask == ALL_ONES) {
                    /* avoid read-modify-write */
                    final long mergeWord = mergeFnwith.applyAsLong(skewWord & halftoneWord, dstLongAt(destIndex));
                    dstLongAtput(destIndex, destMask & mergeWord);
                } else {
                    /* General version using dest masking */
                    long destWord = dstLongAt(destIndex);
                    final long mergeWord = mergeFnwith.applyAsLong(skewWord & halftoneWord, destWord & destMask);
                    destWord = destMask & mergeWord | destWord & ~destMask;
                    dstLongAtput(destIndex, destWord);
                }
                destIndex += 4;
                if (words == 2) {
                    /* e.g., is the next word the last word? */
                    /* set mask for last word in this row */
                    destMask = mask2;
                    nPix = endBits;
                } else {
                    /* use fullword mask for inner loop */
                    destMask = ALL_ONES;
                    nPix = destPPW;
                }
            } while (--words > 0);
            /* step to the first word of the next row */
            sourceIndex += sourceDelta;
            destIndex += destDelta;
        }
    }

    /* Utility routine for computing Warp increments. */

    /* BitBltSimulation>>#deltaFrom:to:nSteps: */
    /*
     * Returns the signed per-step increment for moving from x1 to x2: zero when both are equal,
     * otherwise div(|x2 - x1| + FIXED_PT1, n + 1) + 1 carrying the sign of (x2 - x1).
     */
    private static int deltaFromtonSteps(final int x1, final int x2, final int n) {
        if (x2 == x1) {
            return 0;
        }
        if (x2 > x1) {
            return div(x2 - x1 + FIXED_PT1, n + 1) + 1;
        }
        final int magnitude = div(x1 - x2 + FIXED_PT1, n + 1) + 1;
        return 0 - magnitude;
    }

    /* BitBltSimulation>>#destinationWord:with: */
    /* Merge rule: ignores the source word and yields the destination word unchanged. */
    private static long destinationWordwith(@SuppressWarnings("unused") final long sourceWord, final long destinationWord) {
        final long unchanged = destinationWord;
        return unchanged;
    }

    /* Compute masks for left and right destination words */

    /* BitBltSimulation>>#destMaskAndPointerInit */
    /*
     * Computes mask1/mask2 (the bits to replace in the first and last destination word of a row)
     * and nWords, resets hDir and vDir to 1, and derives destIndex and destDelta from them.
     */
    private void destMaskAndPointerInit() {
        /* A mask, assuming power of two */
        final long pixPerM1 = destPPW - 1;
        /* how many pixels in first word */
        final long startBits = destPPW - (dx & pixPerM1);
        /* how many pixels in last word */
        final long endBits = (dx + bbW - 1 & pixPerM1) + 1;
        /* number of bits of the first/last word that are NOT covered by the copy */
        final long startShift = 32 - startBits * destDepth;
        final long endShift = 32 - endBits * destDepth;
        /*
         * NOTE(review): the shr results are narrowed to int without being re-masked with ALL_ONES,
         * unlike the shl results; kept exactly as found -- confirm this asymmetry is intended.
         */
        if (!destMSB) {
            mask1 = (int) shl(ALL_ONES, startShift) & ALL_ONES;
            mask2 = (int) shr(ALL_ONES, endShift);
        } else {
            mask1 = (int) shr(ALL_ONES, startShift);
            mask2 = (int) shl(ALL_ONES, endShift) & ALL_ONES;
        }
        if (bbW > startBits) {
            nWords = div(bbW - startBits + pixPerM1, destPPW) + 1;
        } else {
            /* everything lands in a single word: fold both masks into mask1 */
            mask1 &= mask2;
            mask2 = 0;
            nWords = 1;
        }
        /* defaults for no overlap with source */
        vDir = 1;
        hDir = 1;
        /* calculate byte addr and delta, based on first word of data */
        /* Note pitch is bytes and nWords is longs, not bytes */
        destIndex = dy * destPitch + div(dx, destPPW) * 4;
        /* byte addr delta */
        destDelta = destPitch * vDir - 4 * (nWords * hDir);
    }

    /* Dither the given 32bit word to 16 bit. Ignore alpha. */

    /* BitBltSimulation>>#dither32To16:threshold: */
    /*
     * Looks up each of the three low bytes of srcWord in DITHER_8_LOOKUP (offset by
     * ditherValue << 8) and packs the three results at bit positions 10, 5 and 0. The top byte of
     * srcWord is not used.
     */
    private static long dither32To16threshold(final long srcWord, final long ditherValue) {
        final long addThreshold = ditherValue << 8;
        final long fromBits16To23 = (long) DITHER_8_LOOKUP[(int) (addThreshold + (srcWord >>> 16 & 0xFF))] << 10;
        final long fromBits8To15 = (long) DITHER_8_LOOKUP[(int) (addThreshold + (srcWord >>> 8 & 0xFF))] << 5;
        final long fromBits0To7 = DITHER_8_LOOKUP[(int) (addThreshold + (srcWord & 0xFF))];
        return fromBits16To23 + fromBits8To15 + fromBits0To7;
    }

    /*
     * This is the primitive implementation of the line-drawing loop. See the comments in
     * BitBlt>>drawLoopX:Y:
     */

    /* BitBltSimulation>>#drawLoopX:Y: */
    /*
     * Draws a line by stepping destX/destY one pixel at a time along the dominant axis and calling
     * copyBits() at every step except the last one. The affected rectangles of the individual
     * copies are accumulated; whenever the accumulated area exceeds 4000, it is published through
     * showDisplayBits() and the accumulation restarts. On normal completion the remaining
     * accumulated rectangle is left in affectedL/R/T/B and destX/destY are stored back into
     * bitBltOop. If failed() reports an error after a copy, the method returns immediately without
     * storing destX/destY.
     *
     * xDelta and yDelta give the extent of the line along x and y; only their signs and absolute
     * values are used.
     */
    private void drawLoopXY(final long xDelta, final long yDelta) {
        /* dx1 / dy1: unit step (-1, 0 or 1) along each axis */
        final long dx1;
        if (xDelta > 0) {
            dx1 = 1;
        } else {
            if (xDelta == 0) {
                dx1 = 0;
            } else {
                dx1 = -1;
            }
        }
        final long dy1;
        if (yDelta > 0) {
            dy1 = 1;
        } else {
            if (yDelta == 0) {
                dy1 = 0;
            } else {
                dy1 = -1;
            }
        }
        /* note the crossed names: px holds |yDelta| and py holds |xDelta| */
        final long px = Math.abs(yDelta);
        final long py = Math.abs(xDelta);
        /* init null rectangle */
        int affL = 9999;
        int affR = -9999;
        int affT = 9999;
        int affB = -9999;
        if (py > px) {
            /* more horizontal */
            /* p is the running error term; when it drops below zero, step along the minor axis */
            long p = py / 2;
            for (int i = 1; i <= py; i++) {
                destX += dx1;
                if ((p -= px) < 0) {
                    destY += dy1;
                    p += py;
                }
                /* the final position is only stored back, not drawn */
                if (i < py) {
                    copyBits();
                    if (failed()) {
                        /* bail out now on failure -- avoid storing x, y */
                        return;
                    }
                    if (affectedL < affectedR && affectedT < affectedB) {
                        /* Affected rectangle grows along the line */
                        affL = affL < affectedL ? affL : affectedL;
                        affR = affR < affectedR ? affectedR : affR;
                        affT = affT < affectedT ? affT : affectedT;
                        affB = affB < affectedB ? affectedB : affB;
                        if ((affR - affL) * (affB - affT) > 4000) {
                            /* If affected rectangle gets large, update it in chunks */
                            affectedL = affL;
                            affectedR = affR;
                            affectedT = affT;
                            affectedB = affB;
                            showDisplayBits();
                            /* init null rectangle */
                            affL = affT = 9999;
                            affR = affB = -9999;
                        }
                    }
                }
            }
        } else {
            /* more vertical */
            /* same scheme as above with the roles of x and y exchanged */
            long p = px / 2;
            for (int i = 1; i <= px; i++) {
                destY += dy1;
                if ((p -= py) < 0) {
                    destX += dx1;
                    p += px;
                }
                /* the final position is only stored back, not drawn */
                if (i < px) {
                    copyBits();
                    if (failed()) {
                        /* bail out now on failure -- avoid storing x, y */
                        return;
                    }
                    if (affectedL < affectedR && affectedT < affectedB) {
                        /* Affected rectangle grows along the line */
                        affL = affL < affectedL ? affL : affectedL;
                        affR = affR < affectedR ? affectedR : affR;
                        affT = affT < affectedT ? affT : affectedT;
                        affB = affB < affectedB ? affectedB : affB;
                        if ((affR - affL) * (affB - affT) > 4000) {
                            /* If affected rectangle gets large, update it in chunks */
                            affectedL = affL;
                            affectedR = affR;
                            affectedT = affT;
                            affectedB = affB;
                            showDisplayBits();
                            /* init null rectangle */
                            affL = affT = 9999;
                            affR = affB = -9999;
                        }
                    }
                }
            }
        }
        /* hand the remaining accumulated rectangle back to the caller */
        affectedL = affL;
        affectedR = affR;
        affectedT = affT;
        affectedB = affB;
        /* store destX, Y back */
        storeIntegerofObjectwithValue(BB_DEST_X_INDEX, bitBltOop, destX);
        storeIntegerofObjectwithValue(BB_DEST_Y_INDEX, bitBltOop, destY);
    }

    /* Dither the given 32bit word to 16 bit. Ignore alpha. */

    /* BitBltSimulation>>#expensiveDither32To16:threshold: */
    /*
     * Dithers the three low bytes of srcWord one at a time: each byte is split into its low three
     * bits (index into DITHER_THRESHOLDS_16) and its high five bits (index into DITHER_VALUES_16);
     * the looked-up value is bumped by one when ditherValue is below the looked-up threshold. The
     * three results are OR-ed together at bit positions 0, 5 and 10. The top byte is not used.
     */
    private static long expensiveDither32To16threshold(final long srcWord, final long ditherValue) {
        int out = 0;
        for (int channel = 0; channel < 3; channel++) {
            final int pv = (int) (srcWord >>> 8 * channel & 0xFF);
            final int threshold = DITHER_THRESHOLDS_16[pv & 7];
            final int value = DITHER_VALUES_16[pv >>> 3];
            final int level = ditherValue < threshold ? value + 1 : value;
            out = out | level << 5 * channel;
        }
        return out;
    }

    /*
     * Return the integer value of the given field of the given object. If the field contains a
     * Float, truncate it and return its integral part. Fail if the given field does not contain a
     * small integer or Float, or if the truncated Float is out of the range of small integers.
     */

    /* BitBltSimulation>>#fetchIntOrFloat:ofObject: */
    /*
     * Reads field fieldIndex of objectPointer and returns it as an int. Long values must fit into
     * an int; FloatObject and Double values are converted through floatToLong. Everything else,
     * including a Long outside the int range, fails the primitive.
     */
    private static int fetchIntOrFloatofObject(final int fieldIndex, final PointersObject objectPointer) {
        final Object fieldOop = fetchPointerofObject(fieldIndex, objectPointer);
        if (fieldOop instanceof FloatObject) {
            return floatToLong(((FloatObject) fieldOop).getValue());
        }
        if (fieldOop instanceof Double) {
            return floatToLong((double) fieldOop);
        }
        if (fieldOop instanceof Long) {
            final long longValue = (long) fieldOop;
            if ((int) longValue == longValue) {
                return (int) longValue;
            }
            /* too big for an int: fall through to the failure below */
        }
        /* Fail if the value is not an int or float (e.g. Fraction), or is out of range. */
        throw PrimitiveFailed.andTransferToInterpreter();
    }

    /*
     * Truncates floatValue to an int. Values outside the int range, and NaN, fail the primitive.
     */
    private static int floatToLong(final double floatValue) {
        final boolean inIntRange = Integer.MIN_VALUE <= floatValue && floatValue <= Integer.MAX_VALUE;
        if (!inIntRange) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        return (int) floatValue;
    }

    /*
     * Return the integer value of the given field of the given object, or the given default value
     * if the field is nil. If the field contains a Float, truncate it and return its integral
     * part. Fail if the given field does not contain nil, a small integer or Float, or if the
     * truncated Float is out of the range of small integers.
     */

    /* BitBltSimulation>>#fetchIntOrFloat:ofObject:ifNil: */
    /*
     * Reads field fieldIndex of objectPointer and returns it as an int, substituting defaultValue
     * (narrowed to int) when the field is nil. Long values must fit into an int; Double and
     * FloatObject values are converted through floatToLong. Everything else, including a Long
     * outside the int range, fails the primitive.
     */
    private static int fetchIntOrFloatofObjectifNil(final int fieldIndex, final PointersObject objectPointer, final long defaultValue) {
        final Object fieldOop = fetchPointerofObject(fieldIndex, objectPointer);
        if (fieldOop == NilObject.SINGLETON) {
            return (int) defaultValue;
        }
        if (fieldOop instanceof Long) {
            final long longValue = (long) fieldOop;
            if (Integer.MIN_VALUE <= longValue && longValue <= Integer.MAX_VALUE) {
                return (int) longValue;
            }
            /* too big for an int: fall through to the failure below */
        } else if (fieldOop instanceof FloatObject) {
            return floatToLong(((FloatObject) fieldOop).getValue());
        } else if (fieldOop instanceof Double) {
            return floatToLong((double) fieldOop);
        }
        /* Fail if the value is not an int or float (e.g. Fraction), or is out of range. */
        throw PrimitiveFailed.andTransferToInterpreter();
    }

    /*
     * For any non-zero pixel value in destinationWord with zero alpha channel take the alpha from
     * sourceWord and fill it in. Intended for fixing alpha channels left at zero during 16->32 bpp
     * conversions.
     */

    /* BitBltSimulation>>#fixAlpha:with: */
    /*
     * Merge rule: returns destinationWord, with bits 24..31 taken from sourceWord when destDepth
     * is 32, destinationWord is non-zero and its own bits 24..31 are all zero.
     */
    private long fixAlphawith(final long sourceWord, final long destinationWord) {
        final long alphaMask = 0xFF000000L;
        final boolean needsAlpha = destDepth == 32 && destinationWord != 0 && (destinationWord & alphaMask) == 0;
        return needsAlpha ? destinationWord | sourceWord & alphaMask : destinationWord;
    }

    /*
     * Answer the name of this module, i.e. the MODULE_NAME constant.
     *
     * Note: This is hardcoded so it can be run from Squeak. The module name is used for validating
     * a module *after* it is loaded, to check that it really contains the module we think it
     * contains. This is important!
     */

    /* InterpreterPlugin>>#getModuleName */
    public static String getModuleName() {
        return MODULE_NAME;
    }

    /* BitBltSimulation>>#ignoreSourceOrHalftone: */
    private boolean ignoreSourceOrHalftone(final Object formPointer) {
        /* A missing form is always ignored. */
        if (formPointer == null) {
            return true;
        }
        /* Otherwise the form is ignored for combination rules 0, 5, 10 and 15. */
        return combinationRule == 0 || combinationRule == 5 || combinationRule == 10 || combinationRule == 15;
    }

    /*
     * Fill opTable with the merge function of every combination rule. The function for rule n is
     * stored at index n + 1 (index 0 is not written here). Transfers to the interpreter and
     * invalidates compiled code before touching the table.
     */

    /* BitBltSimulation>>#initBBOpTable */
    private void initBBOpTable() {
        CompilerDirectives.transferToInterpreterAndInvalidate();
        /* Rules 0-19: functions referenced through the class. */
        opTable[1] = BitBlt::clearWordwith; // rule 0
        opTable[2] = BitBlt::bitAndwith; // rule 1
        opTable[3] = BitBlt::bitAndInvertwith; // rule 2
        opTable[4] = BitBlt::sourceWordwith; // rule 3
        opTable[5] = BitBlt::bitInvertAndwith; // rule 4
        opTable[6] = BitBlt::destinationWordwith; // rule 5
        opTable[7] = BitBlt::bitXorwith; // rule 6
        opTable[8] = BitBlt::bitOrwith; // rule 7
        opTable[9] = BitBlt::bitInvertAndInvertwith; // rule 8
        opTable[10] = BitBlt::bitInvertXorwith; // rule 9
        opTable[11] = BitBlt::bitInvertDestinationwith; // rule 10
        opTable[12] = BitBlt::bitOrInvertwith; // rule 11
        opTable[13] = BitBlt::bitInvertSourcewith; // rule 12
        opTable[14] = BitBlt::bitInvertOrwith; // rule 13
        opTable[15] = BitBlt::bitInvertOrInvertwith; // rule 14
        opTable[16] = BitBlt::destinationWordwith; // rule 15
        opTable[17] = BitBlt::destinationWordwith; // rule 16
        opTable[18] = BitBlt::destinationWordwith; // rule 17
        opTable[19] = BitBlt::addWordwith; // rule 18
        opTable[20] = BitBlt::subWordwith; // rule 19
        /* Rules 20-41: functions bound to this BitBlt instance. */
        opTable[21] = this::rgbAddwith; // rule 20
        opTable[22] = this::rgbSubwith; // rule 21
        opTable[23] = this::oLDrgbDiffwith; // rule 22
        opTable[24] = this::oLDtallyIntoMapwith; // rule 23
        opTable[25] = this::alphaBlendwith; // rule 24
        opTable[26] = this::pixPaintwith; // rule 25
        opTable[27] = this::pixMaskwith; // rule 26
        opTable[28] = this::rgbMaxwith; // rule 27
        opTable[29] = this::rgbMinwith; // rule 28
        opTable[30] = this::rgbMinInvertwith; // rule 29
        opTable[31] = this::alphaBlendConstwith; // rule 30
        opTable[32] = this::alphaPaintConstwith; // rule 31
        opTable[33] = this::rgbDiffwith; // rule 32
        opTable[34] = this::tallyIntoMapwith; // rule 33
        opTable[35] = this::alphaBlendScaledwith; // rule 34
        opTable[36] = this::alphaBlendScaledwith; // rule 35
        opTable[37] = this::alphaBlendScaledwith; // rule 36
        opTable[38] = this::rgbMulwith; // rule 37
        opTable[39] = this::pixSwapwith; // rule 38
        opTable[40] = this::pixClearwith; // rule 39
        opTable[41] = this::fixAlphawith; // rule 40
        opTable[42] = this::rgbComponentAlphawith; // rule 41
    }

    /*
     * Precompute DITHER_8_LOOKUP for every 8-bit value (0..255) and every threshold (0..15). The
     * entry for a (value, threshold) pair lives at index (threshold << 8) + value.
     */

    /* BitBltSimulation>>#initDither8Lookup */
    private static void initDither8Lookup() {
        CompilerDirectives.transferToInterpreterAndInvalidate();
        for (int value = 0; value < 256; value++) {
            for (int threshold = 0; threshold < 16; threshold++) {
                final int index = (threshold << 8) + value;
                DITHER_8_LOOKUP[index] = (int) expensiveDither32To16threshold(value, threshold);
            }
        }
    }

    /*
     * One-time module setup: fills the combination rule table (initBBOpTable) and then the 8-bit
     * dither lookup table (initDither8Lookup).
     */

    /* BitBltSimulation>>#initialiseModule */
    private void initialiseModule() {
        initBBOpTable();
        initDither8Lookup();
    }

    /*
     * Return true if shifts/masks define an identity mapping: either table is missing, or all four
     * shifts are zero and the masks are exactly 0xFF0000 (red), 0xFF00 (green), 0xFF (blue) and
     * 0xFF000000 (alpha).
     */

    /* BitBltSimulation>>#isIdentityMap:with: */
    private static boolean isIdentityMapwith(final int[] shifts, final int[] masks) {
        if (shifts == null || masks == null) {
            return true;
        }
        /* Any non-zero shift moves a component, so the map cannot be the identity. */
        if (shifts[RED_INDEX] != 0 || shifts[GREEN_INDEX] != 0 || shifts[BLUE_INDEX] != 0 || shifts[ALPHA_INDEX] != 0) {
            return false;
        }
        return masks[RED_INDEX] == 0xFF0000 && masks[GREEN_INDEX] == 0xFF00 && masks[BLUE_INDEX] == 0xFF && masks[ALPHA_INDEX] == 0xFF000000;
    }

    /*
     * Load the dest form for BitBlt. Answer false if anything is wrong, true otherwise.
     *
     * Reads bits, width, height and depth from destForm into the dest* fields. A non-positive depth
     * is stored as its absolute value with destMSB set to false. Answers false when destForm is not
     * a pointers object with at least 4 slots, when width or height is negative, when the bits are
     * neither words nor bytes, when the depth is outside 1..32, or when the bits are too small for
     * destPitch * destHeight bytes. Bits given as an integer (surface handle) are not supported and
     * raise a SqueakException.
     */

    /* BitBltSimulation>>#loadBitBltDestForm */
    private boolean loadBitBltDestForm() {
        if (!(isPointers(destForm) && slotSizeOf(destForm) >= 4)) {
            return false;
        }
        final Object destBitsValue = fetchPointerofObject(FORM.BITS, destForm);
        destWidth = fetchIntegerofObject(FORM.WIDTH, destForm);
        destHeight = fetchIntegerofObject(FORM.HEIGHT, destForm);
        if (!(destWidth >= 0 && destHeight >= 0)) {
            return false;
        }
        destDepth = fetchIntegerofObject(FORM.DEPTH, destForm);
        if (!(destMSB = destDepth > 0)) {
            destDepth = 0 - destDepth;
        }
        if (!isWordsOrBytes(destBitsValue)) {
            if (destBitsValue instanceof Long) {
                throw SqueakException.create("Not supported: Query for actual surface dimensions");
            } else {
                return false;
            }
        }
        if (destDepth <= 0 || destDepth > 32) {
            /*
             * Such a depth cannot be laid out in 32-bit words; bailing out here also avoids a
             * division by zero in the pixels-per-word and pitch computation below.
             */
            return false;
        }
        destPPW = div(32, destDepth);
        destPitch = div(destWidth + destPPW - 1, destPPW) * 4;
        final NativeObject destBitsNative = (NativeObject) destBitsValue;
        final long destBitsSize;
        if (isWords(destBitsNative)) {
            destBits = destBitsNative.getIntStorage();
            /* Widen before multiplying so that very large word arrays do not wrap around. */
            destBitsSize = (long) destBitsNative.getIntLength() * Integer.BYTES;
            destBitsBaseOffset = Unsafe.ARRAY_INT_BASE_OFFSET;
            destBitsIndexScale = Unsafe.ARRAY_INT_INDEX_SCALE;
        } else {
            destBits = destBitsNative.getByteStorage();
            destBitsSize = destBitsNative.getByteLength();
            destBitsBaseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET;
            /* Indices are word indices, so one step in a byte array covers Integer.BYTES bytes. */
            destBitsIndexScale = Unsafe.ARRAY_BYTE_INDEX_SCALE * Integer.BYTES;
        }
        /*
         * Compare in 64-bit arithmetic: a 32-bit pitch * height product could overflow and let an
         * undersized bitmap pass this check.
         */
        return destBitsSize >= (long) destPitch * destHeight;
    }

    /*
     * Load BitBlt from the oop. This function is exported for the Balloon engine. Delegates to
     * loadBitBltFromwarping with warping disabled and answers its result.
     */

    /* BitBltSimulation>>#loadBitBltFrom: */
    protected boolean loadBitBltFrom(final PointersObject bbObj) {
        return loadBitBltFromwarping(bbObj, false);
    }

    /*
     * Load context from BitBlt instance. Return false if anything is amiss.
     *
     * Steps, in order: combination rule (rejected when out of range or 16/17), source, halftone and
     * dest forms, destination rectangle, then (unless the source is ignored) source form, color map
     * and source origin, then the halftone form and finally the clip rectangle, which is clamped to
     * the bounds of the destination.
     */
    /*
     * NOTE this should all be changed to minX/maxX coordinates for simpler clipping -- once it
     * works!
     */

    /* BitBltSimulation>>#loadBitBltFrom:warping: */
    private boolean loadBitBltFromwarping(final PointersObject bbObj, final boolean aBool) {
        bitBltOop = bbObj;
        isWarping = aBool;

        /* Combination rule */
        combinationRule = fetchIntegerofObject(BB_RULE_INDEX, bitBltOop);
        if (failed() || combinationRule < 0 || combinationRule > OP_TABLE_SIZE - 2) {
            return false;
        }
        if (combinationRule == 16 || combinationRule == 17) {
            return false;
        }

        /* Forms */
        sourceForm = fetchPointerofObjectOrNull(BB_SOURCE_FORM_INDEX, bitBltOop);
        noSource = ignoreSourceOrHalftone(sourceForm);
        halftoneForm = (AbstractSqueakObject) fetchPointerofObject(BB_HALFTONE_FORM_INDEX, bitBltOop);
        noHalftone = ignoreSourceOrHalftone(halftoneForm == NilObject.SINGLETON ? null : halftoneForm);
        destForm = fetchPointerofObjectOrNull(BB_DEST_FORM_INDEX, bitBltOop);
        if (!loadBitBltDestForm()) {
            return false;
        }

        /* Destination rectangle; nil width/height default to the full destination. */
        destX = fetchIntOrFloatofObjectifNil(BB_DEST_X_INDEX, bitBltOop, 0);
        destY = fetchIntOrFloatofObjectifNil(BB_DEST_Y_INDEX, bitBltOop, 0);
        width = fetchIntOrFloatofObjectifNil(BB_WIDTH_INDEX, bitBltOop, destWidth);
        height = fetchIntOrFloatofObjectifNil(BB_HEIGHT_INDEX, bitBltOop, destHeight);
        if (failed()) {
            return false;
        }

        /* Source form, color map and source origin */
        if (noSource) {
            sourceX = sourceY = 0;
        } else {
            if (!loadBitBltSourceForm() || !loadColorMap()) {
                return false;
            }
            if ((cmFlags & COLOR_MAP_NEW_STYLE) == 0) {
                setupColorMasks();
            }
            sourceX = fetchIntOrFloatofObjectifNil(BB_SOURCE_X_INDEX, bitBltOop, 0);
            sourceY = fetchIntOrFloatofObjectifNil(BB_SOURCE_Y_INDEX, bitBltOop, 0);
        }
        if (!loadHalftoneForm()) {
            return false;
        }

        /* Clip rectangle; nil width/height default to the full destination. */
        clipX = fetchIntOrFloatofObjectifNil(BB_CLIP_X_INDEX, bitBltOop, 0);
        clipY = fetchIntOrFloatofObjectifNil(BB_CLIP_Y_INDEX, bitBltOop, 0);
        clipWidth = fetchIntOrFloatofObjectifNil(BB_CLIP_WIDTH_INDEX, bitBltOop, destWidth);
        clipHeight = fetchIntOrFloatofObjectifNil(BB_CLIP_HEIGHT_INDEX, bitBltOop, destHeight);
        if (failed()) {
            return false;
        }

        /* Move a negative clip origin to zero, shrinking the extent accordingly. */
        if (clipX < 0) {
            clipWidth += clipX;
            clipX = 0;
        }
        if (clipY < 0) {
            clipHeight += clipY;
            clipY = 0;
        }
        /* Cut off whatever extends beyond the right/bottom edge of the destination. */
        if (clipX + clipWidth > destWidth) {
            clipWidth = destWidth - clipX;
        }
        if (clipY + clipHeight > destHeight) {
            clipHeight = destHeight - clipY;
        }
        return true;
    }

    /*
     * Load the source form for BitBlt. Return false if anything is wrong, true otherwise.
     *
     * Reads bits, width, height and depth from sourceForm into the source* fields. A non-positive
     * depth is stored as its absolute value with sourceMSB set to false. Answers false when
     * sourceForm is not a pointers object with at least 4 slots, when width or height is negative,
     * when the bits are neither words nor bytes, when the depth is outside 1..32, or when the bits
     * are too small for sourcePitch * sourceHeight bytes. Bits given as an integer (surface handle)
     * are not supported and raise a SqueakException.
     */

    /* BitBltSimulation>>#loadBitBltSourceForm */
    private boolean loadBitBltSourceForm() {
        if (!(isPointers(sourceForm) && slotSizeOf(sourceForm) >= 4)) {
            return false;
        }
        final Object sourceBitsValue = fetchPointerofObject(FORM.BITS, sourceForm);
        sourceWidth = fetchIntOrFloatofObject(FORM.WIDTH, sourceForm);
        sourceHeight = fetchIntOrFloatofObject(FORM.HEIGHT, sourceForm);
        if (!(sourceWidth >= 0 && sourceHeight >= 0)) {
            return false;
        }
        sourceDepth = fetchIntegerofObject(FORM.DEPTH, sourceForm);
        if (!(sourceMSB = sourceDepth > 0)) {
            sourceDepth = 0 - sourceDepth;
        }
        if (!isWordsOrBytes(sourceBitsValue)) {
            if (sourceBitsValue instanceof Long) {
                throw SqueakException.create("Not supported: Query for actual surface dimensions");
            } else {
                return false;
            }
        }
        if (sourceDepth <= 0 || sourceDepth > 32) {
            /*
             * Such a depth cannot be laid out in 32-bit words; bailing out here also avoids a
             * division by zero in the pixels-per-word and pitch computation below.
             */
            return false;
        }
        sourcePPW = div(32, sourceDepth);
        sourcePitch = div(sourceWidth + sourcePPW - 1, sourcePPW) * 4;
        /*
         * Required size in bytes, computed in 64-bit arithmetic: a 32-bit pitch * height product
         * could overflow and let an undersized bitmap pass the checks below.
         */
        final long requiredBytes = (long) sourcePitch * sourceHeight;
        final NativeObject sourceBitsNative = (NativeObject) sourceBitsValue;
        if (isWords(sourceBitsNative)) {
            final int[] ints = sourceBitsNative.getIntStorage();
            sourceBits = ints;
            sourceBitsBaseOffset = Unsafe.ARRAY_INT_BASE_OFFSET;
            sourceBitsIndexScale = Unsafe.ARRAY_INT_INDEX_SCALE;
            return (long) ints.length * Integer.BYTES >= requiredBytes;
        } else {
            final byte[] bytes = sourceBitsNative.getByteStorage();
            if (bytes.length >= requiredBytes) {
                sourceBits = bytes;
                sourceBitsBaseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET;
                /* Indices are word indices, so one step in a byte array covers Integer.BYTES bytes. */
                sourceBitsIndexScale = Unsafe.ARRAY_BYTE_INDEX_SCALE * Integer.BYTES;
                return true;
            } else {
                return false;
            }
        }
    }

    /*
     * ColorMap, if not nil, must be longWords, and 2^N long, where N = sourceDepth for 1, 2, 4, 8
     * bits, or N = 9, 12, or 15 (3, 4, 5 bits per color) for 16 or 32 bits.
     *
     * Resets all color map state, then loads either an old-style map (a plain word array used as
     * lookup table) or a new-style map (a pointers object holding shift table, mask table and an
     * optional lookup table). Answers false when the map is malformed or its size is not a power of
     * two; answers true otherwise, including when there is no color map at all.
     */

    /* BitBltSimulation>>#loadColorMap */
    private boolean loadColorMap() {
        cmFlags = cmMask = cmBitsPerColor = 0;
        cmShiftTable = null;
        cmMaskTable = null;
        cmLookupTable = null;
        final Object cmOop = fetchPointerofObject(BB_COLOR_MAP_INDEX, bitBltOop);
        if (cmOop == NilObject.SINGLETON) {
            return true;
        }

        /* even if identity or somesuch - may be cleared later */
        cmFlags = COLOR_MAP_PRESENT;
        final boolean oldStyle = cmOop instanceof NativeObject && isWords((NativeObject) cmOop);
        final long cmSize;
        if (oldStyle) {
            /* This is an old-style color map (indexed only, with implicit RGBA conversion) */
            final NativeObject lookupWords = (NativeObject) cmOop;
            cmSize = slotSizeOfWords(lookupWords);
            cmLookupTable = lookupWords.getIntStorage();
        } else {
            /* A new-style color map (fully qualified) */
            if (!(isPointers(cmOop) && slotSizeOf((PointersObject) cmOop) >= 3)) {
                return false;
            }
            final PointersObject cmOopPointers = (PointersObject) cmOop;
            cmShiftTable = loadColorMapShiftOrMaskFrom(fetchNativeofObjectOrNull(0, cmOopPointers));
            cmMaskTable = loadColorMapShiftOrMaskFrom(fetchNativeofObjectOrNull(1, cmOopPointers));
            final NativeObject lookupOop = fetchNativeofObjectOrNull(2, cmOopPointers);
            if (lookupOop == null) {
                cmSize = 0;
            } else {
                if (!isWords(lookupOop)) {
                    return false;
                }
                cmLookupTable = lookupOop.getIntStorage();
                cmSize = cmLookupTable.length;
            }
            cmFlags |= COLOR_MAP_NEW_STYLE;
        }

        /* The size must be zero or a power of two so that size - 1 can serve as index mask. */
        if ((cmSize & (cmSize - 1)) != 0) {
            return false;
        }

        /* Sizes 512, 4096 and 32768 correspond to 3, 4 and 5 bits per color component. */
        if (cmSize == 512) {
            cmBitsPerColor = 3;
        } else if (cmSize == 4096) {
            cmBitsPerColor = 4;
        } else if (cmSize == 32768) {
            cmBitsPerColor = 5;
        } else {
            cmBitsPerColor = 0;
        }

        if (cmSize == 0) {
            /* An empty lookup table counts as no indexed part. */
            cmLookupTable = null;
            cmMask = 0;
        } else {
            cmMask = cmSize - 1;
            cmFlags |= COLOR_MAP_INDEXED_PART;
        }
        if (oldStyle) {
            /* needs implicit conversion */
            setupColorMasks();
        }
        if (isIdentityMapwith(cmShiftTable, cmMaskTable)) {
            /* Identity shift/mask tables carry no information; drop them. */
            cmMaskTable = null;
            cmShiftTable = null;
        } else {
            cmFlags |= COLOR_MAP_FIXED_PART;
        }
        return true;
    }

    /*
     * Answer the int storage of a color map shift or mask table, or null if mapOop is null. The
     * table must be a words object with exactly 4 slots; anything else fails the primitive.
     */

    /* BitBltSimulation>>#loadColorMapShiftOrMaskFrom: */
    private static int[] loadColorMapShiftOrMaskFrom(final NativeObject mapOop) {
        if (mapOop == null) {
            return null;
        }
        if (!(isWords(mapOop) && slotSizeOfWords(mapOop) == 4)) {
            /* Throw explicitly so that an invalid table can never reach getIntStorage() below. */
            throw PrimitiveFailed.andTransferToInterpreter();
        }
        return mapOop.getIntStorage();
    }

    /*
     * Load the halftone form. When the halftone is ignored, halftoneBits is cleared. Otherwise the
     * halftone is either an old-style form (pointers object with at least 4 slots) or a plain word
     * array. Answers false only when the halftone is neither of those.
     */

    /* BitBltSimulation>>#loadHalftoneForm */
    private boolean loadHalftoneForm() {
        if (noHalftone) {
            halftoneBits = null;
            return true;
        }
        if (isPointers(halftoneForm) && slotSizeOf((VariablePointersObject) halftoneForm) >= 4) {
            /* Old-style 32xN monochrome halftone Forms */
            final VariablePointersObject oldStyleForm = (VariablePointersObject) halftoneForm;
            final NativeObject formBits = fetchNativeofObjectOrNull(FORM.BITS, oldStyleForm);
            halftoneHeight = fetchIntegerofObject(FORM.HEIGHT, oldStyleForm);
            if (isWords(formBits)) {
                halftoneBits = formBits.getIntStorage();
            } else {
                /* Bits that are not words switch the halftone off instead of failing. */
                noHalftone = true;
            }
            return true;
        }
        /* New spec accepts, basically, a word array */
        if (!isWords(halftoneForm)) {
            return false;
        }
        halftoneBits = ((NativeObject) halftoneForm).getIntStorage();
        halftoneHeight = halftoneBits.length;
        return true;
    }

    /* BitBltSimulation>>#loadSurfacePlugin not needed for TruffleSqueak */

    /* Load context from a WarpBlt instance: same as loadBitBltFromwarping with warping enabled. */

    /* BitBltSimulation>>#loadWarpBltFrom: */
    private boolean loadWarpBltFrom(final PointersObject bbObj) {
        return loadBitBltFromwarping(bbObj, true);
    }

    /*
     * Record the end offsets of source and destination (pitch * height, or 0 for the source when
     * there is none) and answer whether the bits needed for the operation are present.
     */

    /* BitBltSimulation>>#lockSurfaces */
    private boolean lockSurfaces() {
        // Actual locking code not needed for TruffleSqueak.
        assert destBits != null : "Unexpected `null` value";
        assert sourceBits != null || noSource;

        if (noSource || sourceBits == null) {
            endOfSource = 0;
        } else {
            endOfSource = sourcePitch * sourceHeight;
        }
        endOfDestination = destPitch * destHeight;

        final boolean sourceUsable = sourceBits != null || noSource;
        return destBits != null && sourceUsable;
    }

    /*
     * Color map the given source pixel. Without COLOR_MAP_PRESENT the pixel is answered as is.
     * Otherwise the fixed part (shift/mask conversion) is applied first, if flagged, followed by
     * the indexed part (lookup table), if flagged.
     */

    /* BitBltSimulation>>#mapPixel:flags: */
    private long mapPixelflags(final long sourcePixel, final long mapperFlags) {
        if ((mapperFlags & COLOR_MAP_PRESENT) == 0) {
            return sourcePixel;
        }
        long mapped = sourcePixel;
        if ((mapperFlags & COLOR_MAP_FIXED_PART) != 0) {
            mapped = rgbMapPixelflags(sourcePixel);
            /* avoid introducing transparency by color reduction */
            if (mapped == 0 && sourcePixel != 0) {
                mapped = 1;
            }
        }
        if ((mapperFlags & COLOR_MAP_INDEXED_PART) != 0) {
            mapped = cmLookupTable[(int) (mapped & cmMask)];
        }
        return mapped;
    }

    /* BitBltSimulation>>#merge:with: omitted (no senders) */

    /*
     * The module with the given name was just unloaded. Make sure we have no dangling references.
     */

    /* BitBltSimulation>>#moduleUnloaded: omitted */

    /*
     * Subtract the pixels in the source and destination, color by color, and add the sum of the
     * absolute values of all the differences to bitCount. For non-rgb depths (below 16), XOR the
     * two words and count the number of differing pixels instead. The destination word itself is
     * always answered unchanged. Note that the region is not clipped to bit boundaries, but only to
     * the nearest (enclosing) word. This is because copyLoop does not do pre-merge masking. For
     * accurate results, you must subtract the values obtained from the left and right fringes.
     */

    /* BitBltSimulation>>#OLDrgbDiff:with: */
    private long oLDrgbDiffwith(final long sourceWord, final long destinationWord) {
        if (destDepth < 16) {
            /* Just xor and count differing pixels if not RGB */
            final int pixMask = MASK_TABLE[destDepth];
            for (long remaining = sourceWord ^ destinationWord; remaining != 0; remaining = shr(remaining, destDepth)) {
                if ((remaining & pixMask) != 0) {
                    bitCount++;
                }
            }
        } else if (destDepth == 16) {
            /* Two pixels per word with three 5-bit components each: low half first, then high. */
            final long lowDiff = partitionedSubfromnBitsnPartitions(sourceWord, destinationWord, 5, 3);
            bitCount += (lowDiff & 0x1F) + ((lowDiff >>> 5) & 0x1F) + ((lowDiff >>> 10) & 0x1F);
            final long highDiff = partitionedSubfromnBitsnPartitions(sourceWord >>> 16, destinationWord >>> 16, 5, 3);
            bitCount += (highDiff & 0x1F) + ((highDiff >>> 5) & 0x1F) + ((highDiff >>> 10) & 0x1F);
        } else {
            /* One pixel per word with three 8-bit components. */
            final long wordDiff = partitionedSubfromnBitsnPartitions(sourceWord, destinationWord, 8, 3);
            bitCount += (wordDiff & 0xFF) + ((wordDiff >>> 8) & 0xFF) + ((wordDiff >>> 16) & 0xFF);
        }
        return destinationWord;
    }

    /*
     * Tally pixels into the color map. Nothing is tallied unless the color map is present and has
     * an indexed part; the destination word is always answered unchanged. Note that the source
     * should be specified = destination, in order for the proper color map checks to be performed
     * at setup. Note that the region is not clipped to bit boundaries, but only to the nearest
     * (enclosing) word. This is because copyLoop does not do pre-merge masking. For accurate
     * results, you must subtract the values obtained from the left and right fringes.
     */

    /* BitBltSimulation>>#OLDtallyIntoMap:with: */
    private long oLDtallyIntoMapwith(@SuppressWarnings("unused") final long sourceWord, final long destinationWord) {
        final long requiredFlags = COLOR_MAP_PRESENT | COLOR_MAP_INDEXED_PART;
        if ((cmFlags & requiredFlags) != requiredFlags) {
            return destinationWord;
        }
        if (destDepth < 16) {
            /* loop through all packed pixels. */
            final long pixMask = MASK_TABLE[destDepth] & cmMask;
            long remainingPixels = destinationWord;
            for (int i = 0; i < destPPW; i++) {
                tallyMapAtput(remainingPixels & pixMask);
                remainingPixels = shr(remainingPixels, destDepth);
            }
        } else if (destDepth == 16) {
            /* Two pixels: tally the right half, then the left half. */
            final long rightIndex = rgbMapfromto(destinationWord & 0xFFFF, 5, cmBitsPerColor);
            tallyMapAtput(rightIndex);
            final long leftIndex = rgbMapfromto(destinationWord >>> 16, 5, cmBitsPerColor);
            tallyMapAtput(leftIndex);
        } else {
            /* Just one pixel. */
            tallyMapAtput(rgbMapfromto(destinationWord, 8, cmBitsPerColor));
        }
        return destinationWord;
    }

    /*
     * Add word1 to word2 as partitions of nBits each, saturating every partition that overflows.
     * This is useful for packed pixels, or packed colors. carryOverflowMask selects the high bit of
     * each partition; componentMask is multiplied with the shifted overflow bits to fill
     * overflowed partitions.
     */
    /*
     * Use long everywhere because it has a well known arithmetic model without undefined behavior
     * w.r.t. overflow and shifts
     */

    /* BitBltSimulation>>#partitionedAdd:to:nBits:componentMask:carryOverflowMask: */
    private static long partitionedAddtonBitscomponentMaskcarryOverflowMask(final long word1, final long word2, final long nBits, final long componentMask, final long carryOverflowMask) {
        /* high bit of each component */
        final long high1 = word1 & carryOverflowMask;
        final long high2 = word2 & carryOverflowMask;
        /* add without the high bits so that no component can carry into its neighbour */
        final long low1 = word1 ^ high1;
        final long low2 = word2 ^ high2;
        final long partialSum = low1 + low2;
        /* a component overflows if both high bits are set, or one is set and the sum carried */
        final long carryOverflow = (high1 & high2) | ((high1 | high2) & partialSum);
        final long saturation = shr(carryOverflow, nBits - 1) * componentMask;
        return (partialSum ^ high1 ^ high2) | saturation;
    }

    /*
     * AND word1 to word2 as nParts partitions of nBits each. Any field of word1 not all-ones is
     * treated as all-zeroes. Used for erasing, eg, brush shapes prior to ORing in a color. For
     * nBits == 32 the whole word is treated as a single partition.
     */

    /* BitBltSimulation>>#partitionedAND:to:nBits:nPartitions: */
    private static long partitionedANDtonBitsnPartitions(final long word1, final long word2, final int nBits, final int nParts) {
        /* partition mask starts at the right */
        long mask = MASK_TABLE[nBits];
        if (nBits == 32) {
            return word1 == mask ? word2 : 0;
        }
        long result = 0;
        for (int part = 0; part < nParts; part++) {
            if ((word1 & mask) == mask) {
                result |= word2 & mask;
            }
            /* slide left to next partition */
            mask = shl(mask, nBits);
        }
        return result;
    }

    /*
     * Max word1 to word2 as nParts partitions of nBits each. For nBits == 32 the two words are
     * compared as a whole.
     */
    /*
     * In C, most arithmetic operations answer the same bit pattern regardless of the operands being
     * signed or longs (this is due to the way 2's complement numbers work). However, comparisons
     * might fail. Add the proper declaration of words as long in those cases where comparisons are
     * done (jmv)
     */

    /* BitBltSimulation>>#partitionedMax:with:nBits:nPartitions: */
    private static long partitionedMaxwithnBitsnPartitions(final long word1, final long word2, final int nBits, final int nParts) {
        if (nBits == 32) {
            return Math.max(word1, word2);
        }
        long result = 0;
        /* partition mask starts at the right */
        long mask = MASK_TABLE[nBits];
        for (int part = 0; part < nParts; part++) {
            result |= Math.max(word2 & mask, word1 & mask);
            /* slide left to next partition */
            mask = shl(mask, nBits);
        }
        return result;
    }

    /*
     * Min word1 to word2 as nParts partitions of nBits each. For nBits == 32 the two words are
     * compared as a whole.
     */
    /*
     * In C, most arithmetic operations answer the same bit pattern regardless of the operands being
     * signed or longs (this is due to the way 2's complement numbers work). However, comparisons
     * might fail. Add the proper declaration of words as long in those cases where comparisons are
     * done (jmv)
     */

    /* BitBltSimulation>>#partitionedMin:with:nBits:nPartitions: */
    private static long partitionedMinwithnBitsnPartitions(final long word1, final long word2, final int nBits, final int nParts) {
        if (nBits == 32) {
            return Math.min(word1, word2);
        }
        long result = 0;
        /* partition mask starts at the right */
        long mask = MASK_TABLE[nBits];
        for (int part = 0; part < nParts; part++) {
            result |= Math.min(word2 & mask, word1 & mask);
            /* slide left to next partition */
            mask = shl(mask, nBits);
        }
        return result;
    }

    /*
     * Multiply word1 with word2 as nParts partitions of nBits each. This is useful for packed
     * pixels, or packed colors. Bug in loop version when non-white background. The computation is
     * unrolled for up to four partitions; each partition product is ((a + 1) * (b + 1) - 1), of
     * which the upper nBits are kept.
     */
    /*
     * In C, integer multiplication might answer a wrong value if the unsigned values are declared
     * as signed. This problem does not affect this method, because the most significant bit (i.e.
     * the sign bit) will always be zero (jmv)
     */

    /* BitBltSimulation>>#partitionedMul:with:nBits:nPartitions: */
    private static long partitionedMulwithnBitsnPartitions(final long word1, final long word2, final int nBits, final int nParts) {
        /* sMask selects one component at the right; dMask selects the product bits that are kept */
        final long sMask = MASK_TABLE[nBits];
        final long dMask = shl(sMask, nBits);

        /* first partition: kept bits are moved down into the lowest component */
        final long first = (((word1 & sMask) + 1) * ((word2 & sMask) + 1) - 1) & dMask;
        long result = shr(first, nBits);
        if (nParts == 1) {
            return result;
        }

        /* second partition: kept bits are already in place */
        final long second = (((shr(word1, nBits) & sMask) + 1) * ((shr(word2, nBits) & sMask) + 1) - 1) & dMask;
        result |= second;
        if (nParts == 2) {
            return result;
        }

        /* third partition: kept bits move up by one component */
        final long third = (((shr(word1, 2 * nBits) & sMask) + 1) * ((shr(word2, 2 * nBits) & sMask) + 1) - 1) & dMask;
        result |= shl(third, nBits);
        if (nParts == 3) {
            return result;
        }

        /* fourth partition: kept bits move up by two components */
        final long fourth = (((shr(word1, 3 * nBits) & sMask) + 1) * ((shr(word2, 3 * nBits) & sMask) + 1) - 1) & dMask;
        result |= shl(fourth, 2 * nBits);
        return result;
    }

    /*
     * Apply rgbComponentAlpha32with to each of the nParts pixels of nBits packed in sourceWord and
     * destWord. Pixels narrower than 32 bits are expanded to 32 bits with the alpha byte forced to
     * 0xFF before the call, and the result is mapped back to nBits afterwards.
     */

    /* BitBltSimulation>>#partitionedRgbComponentAlpha:dest:nBits:nPartitions: */
    private long partitionedRgbComponentAlphadestnBitsnPartitions(final long sourceWord, final long destWord, final int nBits, final int nParts) {
        final boolean needsConversion = nBits != 32;
        /* partition mask starts at the right */
        long mask = MASK_TABLE[nBits];
        long result = 0;
        for (int part = 0; part < nParts; part++) {
            final int shift = part * nBits;
            long srcPixel = shr(sourceWord & mask, shift);
            long dstPixel = shr(destWord & mask, shift);
            if (needsConversion) {
                if (nBits == 16) {
                    /* expand three 5-bit components into the upper bits of three 8-bit ones */
                    srcPixel = (srcPixel & 0x1F) << 3 | (srcPixel & 0x3E0) << 6 | (srcPixel & 0x7C00) << 9 | 0xFF000000L;
                    dstPixel = (dstPixel & 0x1F) << 3 | (dstPixel & 0x3E0) << 6 | (dstPixel & 0x7C00) << 9 | 0xFF000000L;
                } else {
                    srcPixel = rgbMapfromto(srcPixel, nBits, 32) | 0xFF000000L;
                    dstPixel = rgbMapfromto(dstPixel, nBits, 32) | 0xFF000000L;
                }
            }
            long blended = rgbComponentAlpha32with(srcPixel, dstPixel);
            if (needsConversion) {
                blended = rgbMapfromto(blended, 32, nBits);
            }
            result |= shl(blended, shift);
            /* slide left to next partition */
            mask = shl(mask, nBits);
        }
        return result;
    }

    /*
     * Subtract word1 from word2 as nParts partitions of nBits each, answering the absolute value of
     * the difference in every partition. This is useful for packed pixels, or packed colors. For
     * nBits == 32 the two words are subtracted as a whole.
     */
    /*
     * In C, most arithmetic operations answer the same bit pattern regardless of the operands being
     * signed or longs (this is due to the way 2's complement numbers work). However, comparisons
     * might fail. Add the proper declaration of words as long in those cases where comparisons are
     * done (jmv)
     */

    /* BitBltSimulation>>#partitionedSub:from:nBits:nPartitions: */
    private static long partitionedSubfromnBitsnPartitions(final long word1, final long word2, final int nBits, final int nParts) {
        if (nBits == 32) {
            return word1 < word2 ? word2 - word1 : word1 - word2;
        }
        long result = 0;
        /* partition mask starts at the right */
        long mask = MASK_TABLE[nBits];
        for (int part = 0; part < nParts; part++) {
            final long component1 = word1 & mask;
            final long component2 = word2 & mask;
            /* result is really the absolute value of the difference */
            result |= component1 < component2 ? component2 - component1 : component1 - component2;
            /* slide left to next partition */
            mask = shl(mask, nBits);
        }
        return result;
    }

    /*
     * Based on the values provided during setup choose and perform the appropriate inner loop
     * function: a fill loop without source, the general pixel-mapping loop, or the plain copy loop.
     */

    /* BitBltSimulation>>#performCopyLoop */
    private void performCopyLoop() {
        destMaskAndPointerInit();
        if (noSource) {
            /* Simple fill loop */
            copyLoopNoSource();
            return;
        }
        /* Loop using source and dest */
        checkSourceOverlap();
        /*
         * If we must convert between pixel depths or use color lookups or swap pixels use the
         * general version
         */
        final boolean needsPixelMapping = sourceDepth != destDepth || cmFlags != 0 || sourceMSB != destMSB;
        if (needsPixelMapping) {
            copyLoopPixMap();
        } else {
            /* Otherwise we simply copy pixels and can use a faster version */
            sourceSkewAndPointerInit();
            copyLoop();
        }
    }

    /*
     * Pick nPix pixels starting at srcBitIndex from the source, map by the color map, and justify
     * them according to dstBitIndex in the resulting destWord. The source position (sourceIndex and
     * srcBitShift) is advanced as pixels are consumed; the loop always runs at least once.
     */

    /* BitBltSimulation>>#pickSourcePixels:flags:srcMask:destMask:srcShiftInc:dstShiftInc: */
    private long pickSourcePixelsflagssrcMaskdestMasksrcShiftIncdstShiftInc(final long nPixels, final long mapperFlags, final long srcMask, final long dstMask, final long srcShiftInc,
                    final long dstShiftInc) {
        long destWord = 0;
        /* Work on local copies of the shifts (hint: keep in registers) */
        long srcShift = srcBitShift;
        long dstShift = dstBitShift;
        /* always > 0 so we can use do { } while (...) */
        long remaining = nPixels;
        final boolean indexedLookupOnly = mapperFlags == (COLOR_MAP_PRESENT | COLOR_MAP_INDEXED_PART);
        if (indexedLookupOnly) {
            /* a little optimization for (pretty crucial) blits using indexed lookups only */
            do {
                /* grab, colormap and mix in pixel */
                final long sourceWord = srcLongAt(sourceIndex);
                final long sourcePix = shr(sourceWord, srcShift) & srcMask;
                final long destPix = cmLookupTable[(int) (sourcePix & cmMask)];
                destWord |= shl(destPix & dstMask, dstShift);
                /* adjust dest pix index */
                dstShift += dstShiftInc;
                /* adjust source pix index; a shift outside 0..31 means the next source word */
                srcShift += srcShiftInc;
                if ((srcShift & 0xFFFFFFE0L) != 0) {
                    srcShift += sourceMSB ? 32 : -32;
                    /* begin incSrcIndex: */
                    sourceIndex += 4;
                }
                remaining--;
            } while (remaining > 0);
        } else {
            do {
                /* grab, colormap and mix in pixel */
                final long sourceWord = srcLongAt(sourceIndex);
                final long sourcePix = shr(sourceWord, srcShift) & srcMask;
                final long destPix = mapPixelflags(sourcePix, mapperFlags);
                destWord |= shl(destPix & dstMask, dstShift);
                /* adjust dest pix index */
                dstShift += dstShiftInc;
                /* adjust source pix index; a shift outside 0..31 means the next source word */
                srcShift += srcShiftInc;
                if ((srcShift & 0xFFFFFFE0L) != 0) {
                    srcShift += sourceMSB ? 32 : -32;
                    /* begin incSrcIndex: */
                    sourceIndex += 4;
                }
                remaining--;
            } while (remaining > 0);
        }
        /* Store back */
        srcBitShift = srcShift;
        return destWord;
    }

    /*
     * Pick a single pixel from the source for WarpBlt. Note: This method is crucial for WarpBlt
     * speed w/o smoothing and still relatively important when smoothing is used.
     */

    /* BitBltSimulation>>#pickWarpPixelAtX:y: */
    private long pickWarpPixelAtXy(final long xx, final long yy) {
        /*
         * Drops the fractional part of the given coordinates (BINARY_POINT bits) and answers the
         * source pixel found there, or 0 when the point lies outside the source form. As a side
         * effect srcBitShift is set to the shift of the picked pixel.
         */
        final long pixelX = xx >>> BINARY_POINT;
        final long pixelY = yy >>> BINARY_POINT;
        final boolean insideSource = xx >= 0 && yy >= 0 && pixelX < sourceWidth && pixelY < sourceHeight;
        if (!insideSource) {
            return 0;
        }
        /* Byte offset of the word holding the pixel: row start plus word offset within the row. */
        final long wordOffsetInRow = shr(pixelX, warpAlignShift) * 4;
        final long fetched = srcLongAt(pixelY * sourcePitch + wordOffsetInRow);
        /* Position of the pixel inside that word. */
        srcBitShift = warpBitShiftTable[(int) (pixelX & warpAlignMask)];
        return shr(fetched, srcBitShift) & warpSrcMask;
    }

    /*
     * Clear all pixels in destinationWord for which the pixels of sourceWord have the same values.
     * Used to clear areas of some constant color to zero.
     */

    /* BitBltSimulation>>#pixClear:with: */
    private long pixClearwith(final long sourceWord, final long destinationWord) {
        /*
         * Answers destinationWord with every pixel zeroed whose value equals the pixel at the same
         * position in sourceWord; all other destination pixels are kept.
         */
        if (destDepth == 32) {
            /* One pixel per word: compare the words directly. */
            return sourceWord == destinationWord ? 0 : destinationWord;
        }
        final int pixelBits = destDepth;
        /* Start with the rightmost pixel and walk the mask to the left. */
        long pixelMask = MASK_TABLE[pixelBits];
        long kept = 0;
        for (int pixel = 0; pixel < destPPW; pixel++) {
            final long destPixel = destinationWord & pixelMask;
            if ((sourceWord & pixelMask) != destPixel) {
                kept |= destPixel;
            }
            pixelMask = shl(pixelMask, pixelBits);
        }
        return kept;
    }

    /* BitBltSimulation>>#pixMask:with: */
    private long pixMaskwith(final long sourceWord, final long destinationWord) {
        /* Partitioned AND of the inverted source with the destination, one partition per pixel. */
        final long invertedSource = ~sourceWord;
        return partitionedANDtonBitsnPartitions(invertedSource, destinationWord, destDepth, destPPW);
    }

    /* BitBltSimulation>>#pixPaint:with: */
    private long pixPaintwith(final long sourceWord, final long destinationWord) {
        /*
         * Answers the source OR-ed with the partitioned AND of the inverted source and the
         * destination. An all-zero source word leaves the destination untouched.
         */
        if (sourceWord != 0) {
            final long maskedDestination = partitionedANDtonBitsnPartitions(~sourceWord, destinationWord, destDepth, destPPW);
            return sourceWord | maskedDestination;
        }
        return destinationWord;
    }

    /* Swap the pixels in destWord */

    /* BitBltSimulation>>#pixSwap:with: */
    private long pixSwapwith(@SuppressWarnings("unused") final long sourceWord, final long destWord) {
        /*
         * Reverses the order of the pixels inside destWord. Pixels are exchanged pairwise, working
         * from the outermost pair towards the middle of the word; sourceWord is ignored.
         */
        if (destPPW == 1) {
            /* A single pixel per word has nothing to swap with. */
            return destWord;
        }
        /* Masks for the lowest and the highest pixel of the word. */
        long lowMask = shl(1, destDepth) - 1;
        long highMask = shl(lowMask, (destPPW - 1) * destDepth);
        /* Distance between the two pixels of the current pair. */
        int shift = 32 - destDepth;
        long swapped = shl(destWord & lowMask, shift) | shr(destWord & highMask, shift);
        /* Remaining pairs; the bound is already exhausted when there are at most two pixels. */
        for (int pair = 1; pair < destPPW / 2; pair++) {
            lowMask = shl(lowMask, destDepth);
            highMask = shr(highMask, destDepth);
            shift -= destDepth * 2;
            swapped |= shl(destWord & lowMask, shift) | shr(destWord & highMask, shift);
        }
        return swapped;
    }

    /*
     * Invoke the copyBits primitive. If the destination is the display, then copy it to the screen.
     */

    /* BitBltSimulation>>#primitiveCopyBits */
    @TruffleBoundary(transferToInterpreterOnException = false)
    public long primitiveCopyBits(final PointersObject bbObj, final long factor) {
        /*
         * Loads the BitBlt state from bbObj, performs copyBits with the given factor and then
         * calls showDisplayBits. Answers bitCount for combination rules 22 and 32 and -1 (meaning
         * "answer the receiver") otherwise. The primitive fails if the state cannot be loaded or
         * if one of the assertions below trips.
         */
        if (!loadBitBltFromwarping(bbObj, false)) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        try {
            copyBits(factor);
            assert !failed();
            showDisplayBits();
            assert !failed();
        } catch (final AssertionError e) {
            /*
             * Report before failing: andTransferToInterpreter() is relied upon throughout this
             * class to abort the primitive, so a statement placed after it would never run.
             */
            image.printToStdErr(e.getMessage());
            PrimitiveFailed.andTransferToInterpreter();
        }
        if (combinationRule == 22 || combinationRule == 32) {
            return bitCount;
        } else {
            return -1; // return receiver
        }
    }

    /* BitBltSimulation>>#primitiveDisplayString */
    @TruffleBoundary(transferToInterpreterOnException = false)
    public void primitiveDisplayString(final PointersObject bbObj, final NativeObject sourceString, final long startIndex, final long stopIndex, final long[] glyphMap,
                    final long[] xTable, final int kernDelta) {
        /*
         * Blits the characters startIndex..stopIndex (1-based, inclusive) of sourceString one glyph
         * at a time. Each byte of the string is mapped to a glyph index through glyphMap; xTable
         * supplies the source x position of a glyph (entry i) and of its successor (entry i + 1),
         * whose difference is the glyph width. After every glyph destX advances by that width plus
         * kernDelta, and the final destX is stored back into bbObj. The primitive fails if the
         * BitBlt state cannot be loaded, if the combination rule is 30 or 0x1F, if the surfaces
         * cannot be locked, or if a glyph index falls outside 0..xTable.length - 2.
         */
        /**
         * Most checks moved to guard of specialization in {@link PrimDisplayStringNode}.
         *
         * <pre>
         * if (!(slotSizeOf(glyphMap) == 256 && isBytes(sourceString) && startIndex > 0 && stopIndex >= 0 &&
         *              stopIndex <= sourceString.getByteLength() && loadBitBltFromwarping(bbObj, false) && combinationRule != 30 && combinationRule != 0x1F)) {
         * </pre>
         */
        if (!(loadBitBltFromwarping(bbObj, false) && combinationRule != 30 && combinationRule != 0x1F)) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        /**
         * Check moved to guard of specialization in {@link PrimDisplayStringNode}.
         *
         * <pre>
         * if (stopIndex == 0) {
         *     return bbObj;
         * }
         * </pre>
         */

        /* See if we can go directly into copyLoopPixMap (usually we can) */
        /* Highest usable glyph index: the loop also reads xTable[glyphIndex + 1]. */
        final long maxGlyph = xTable.length - 2;
        /* no point using slower version */
        final boolean quickBlt = destBits != null && sourceBits != null &&
                        !noSource && sourceForm != destForm && (cmFlags != 0 || sourceMSB != destMSB || sourceDepth != destDepth);
        if (quickBlt) {
            /* The quick path skips lockSurfaces(), so the end offsets are set up here instead. */
            endOfSource = sourcePitch * sourceHeight;
            endOfDestination = destPitch * destHeight;
        } else {
            if (!lockSurfaces()) {
                PrimitiveFailed.andTransferToInterpreter();
            }
        }
        /* Remember where the string starts; destX is advanced glyph by glyph below. */
        final int left = destX;
        final byte[] sourceStringBytes = sourceString.getByteStorage();
        for (int charIndex = (int) startIndex; charIndex <= stopIndex; charIndex++) {
            /* charIndex is 1-based; bytes are read as unsigned values 0..255. */
            final int ascii = Byte.toUnsignedInt(sourceStringBytes[charIndex - 1]);
            final int glyphIndex = (int) glyphMap[ascii];
            if (glyphIndex < 0 || glyphIndex > maxGlyph) {
                PrimitiveFailed.andTransferToInterpreter();
            }
            /* Source rectangle of the glyph: from its own x entry up to the next glyph's. */
            sourceX = (int) xTable[glyphIndex];
            width = (int) (xTable[glyphIndex + 1] - sourceX);
            assert !failed();
            clipRange();
            /* Nothing is drawn for glyphs that are clipped away completely. */
            if (bbW > 0 && bbH > 0) {
                if (quickBlt) {
                    destMaskAndPointerInit();
                    copyLoopPixMap();
                    /* Record the rectangle touched by this glyph. */
                    affectedL = dx;
                    affectedR = dx + bbW;
                    affectedT = dy;
                    affectedB = dy + bbH;
                } else {
                    copyBitsLockedAndClipped();
                }
            }
            assert !failed();
            destX = destX + width + kernDelta;
        }
        /* Reset the left edge to the start of the string (presumably so the affected area spans all glyphs). */
        affectedL = left;
        if (!quickBlt) {
            unlockSurfaces();
        }
        showDisplayBits();
        /* Hand the advanced x position back to the BitBlt object. */
        storeIntegerofObjectwithValue(BB_DEST_X_INDEX, bbObj, destX);
    }

    /* Invoke the line drawing primitive. */

    /* BitBltSimulation>>#primitiveDrawLoop */
    @TruffleBoundary(transferToInterpreterOnException = false)
    public void primitiveDrawLoop(final PointersObject bbObj, final long xDelta, final long yDelta) {
        /*
         * Loads the BitBlt state from bbObj and, unless a failure has been flagged, runs the draw
         * loop for the given deltas and then calls showDisplayBits.
         */
        final boolean loaded = loadBitBltFromwarping(bbObj, false);
        if (!loaded) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        if (failed()) {
            return;
        }
        drawLoopXY(xDelta, yDelta);
        showDisplayBits();
    }

    /*
     * Returns the single pixel at x@y. It does not handle LSB bitmaps right now. If x or y are < 0,
     * return 0 to indicate transparent (cf BitBlt>bitPeekerFromForm: usage). Likewise if x >= width
     * or y >= height. Fail if the rcvr doesn't seem to be a Form, or x|y seem wrong.
     */

    /* BitBltSimulation>>#primitivePixelValueAtX:y: */
    @TruffleBoundary(transferToInterpreterOnException = false)
    public long primitivePixelValueAt(final PointersObject bbObj, final long xVal, final long yVal) {
        /*
         * Answers the pixel at xVal@yVal of the form bbObj, read from its bitmap (FORM.BITS) using
         * the form's width, height and depth. Answers 0 when the point lies at or beyond the
         * width/height. Fails the primitive when the bitmap is neither words nor bytes, when the
         * depth is not positive (negative depths, i.e. LSB forms, are not supported here and a
         * depth of 0 cannot be divided by), or when the bitmap is too small for the form.
         */
        assert !(xVal < 0 || yVal < 0) : "Precondition not checked in guard";
        assert isPointers(bbObj) && slotSizeOf(bbObj) >= FORM.OFFSET : "Precondition not checked in guard";

        final NativeObject bitmap = fetchNativeofObjectOrNull(FORM.BITS, bbObj);
        if (!isWordsOrBytes(bitmap)) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        width = fetchIntegerofObject(FORM.WIDTH, bbObj);
        height = fetchIntegerofObject(FORM.HEIGHT, bbObj);
        /* if width/height/depth are not integer, fail */
        final int depth = fetchIntegerofObject(FORM.DEPTH, bbObj);
        assert !failed();
        if (xVal >= width || yVal >= height) {
            return 0L;
        }
        /* A depth of 0 would divide by zero below; treat it like an unsupported negative depth. */
        if (depth <= 0) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        /* pixels in each word */
        final int ppW = div(32, depth);
        /* how many words per row of pixels */
        final int stride = div(width + ppW - 1, ppW);
        /* Size of the bitmap in bytes, so that it can be compared with the byte count below. */
        final int bitsSize;
        if (isWords(bitmap)) {
            /* Integer.BYTES (4), not Integer.SIZE (32): a word contributes four bytes. */
            bitsSize = bitmap.getIntLength() * Integer.BYTES;
        } else {
            bitsSize = bitmap.getByteLength();
        }
        /* stride words per row, height rows, 4 bytes per word */
        if (bitsSize < stride * height * 4) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        /* load the word that contains our target */
        final long index = yVal * stride + div(xVal, ppW);
        final long word;
        if (isWords(bitmap)) {
            word = Integer.toUnsignedLong(bitmap.getIntStorage()[(int) index]);
        } else {
            word = Integer.toUnsignedLong(UnsafeUtils.getInt(bitmap.getByteStorage(), index));
        }
        /* make a mask to isolate the pixel within that word */
        final long mask = shr(0xFFFFFFFFL, 32 - depth);
        /*
         * this is the tricky MSB part - we mask the xVal to find how far into the word we need,
         * then add 1 for the pixel we're looking for, then * depth to get the bit shift
         */
        final long shift = 32 - ((xVal & ppW - 1) + 1) * depth;
        /* shift, mask and dim the lights */
        return shr(word, shift) & mask;
    }

    /*
     * Invoke the warpBits primitive. If the destination is the display, then copy it to the screen.
     */

    /* BitBltSimulation>>#primitiveWarpBits */
    @TruffleBoundary(transferToInterpreterOnException = false)
    public void primitiveWarpBits(final PointersObject bbObj, final long n, final AbstractSqueakObject sourceMap) {
        /*
         * Loads the WarpBlt state from bbObj (failing the primitive if that is not possible),
         * performs warpBits with the given arguments and then calls showDisplayBits.
         */
        final boolean loaded = loadWarpBltFrom(bbObj);
        if (!loaded) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        warpBits(n, sourceMap);
        assert !failed();
        showDisplayBits();
        assert !failed();
    }

    /* BitBltSimulation>>#rgbAdd:with: */
    private long rgbAddwith(final long sourceWord, final long destinationWord) {
        /*
         * Partitioned addition of source and destination. The partition size depends on the
         * destination depth: whole pixels below 16 bits, 5-bit components at 16 bits and 8-bit
         * components otherwise.
         */
        if (destDepth == 16) {
            /* 5-bit components; bit 15 of each 16-bit pixel is masked off first (0x7FFF7FFF). */
            final long componentMask = 0x1F;
            final long carryOverflowMask = 0x42104210;
            return partitionedAddtonBitscomponentMaskcarryOverflowMask(sourceWord & 0x7FFF7FFF, destinationWord & 0x7FFF7FFF, 5, componentMask, carryOverflowMask);
        }
        if (destDepth > 16) {
            /* 8-bit components; the carry mask marks the top bit of every byte. */
            final long componentMask = 0xFF;
            final long carryOverflowMask = 0x80808080L;
            return partitionedAddtonBitscomponentMaskcarryOverflowMask(sourceWord, destinationWord, 8, componentMask, carryOverflowMask);
        }
        /* Fewer than 16 bits: each pixel is one partition of destDepth bits. */
        final long componentMask = shl(1, destDepth) - 1;
        final long carryOverflowMask = shl(div(0xFFFFFFFFL, componentMask), destDepth - 1);
        return partitionedAddtonBitscomponentMaskcarryOverflowMask(sourceWord, destinationWord, destDepth, componentMask, carryOverflowMask);
    }

    /*
     * This version assumes combinationRule = 41 sourcePixSize = 32 destPixSize = 16 sourceForm ~=
     * destForm.
     */

    /* BitBltSimulation>>#rgbComponentAlpha16 */
    private void rgbComponentAlpha16() {
        /*
         * Walks the bbW x bbH rectangle, reading one 32-bit source word per pixel and blending it
         * into the matching 16-bit half of a destination word. Source words whose low 24 bits are
         * all zero leave the destination untouched. The destination pixel is expanded to 32 bits,
         * blended via rgbComponentAlpha32with, dithered back to 16 bits and stored under dstMask.
         */
        /* So we can pre-decrement */
        int deltaY = bbH + 1;
        int srcY = sy;
        int dstY = dy;
        /* Bit offset (0 or 16) of the first destination pixel inside its word. */
        int srcShift = (dx & 1) * 16;
        if (destMSB) {
            srcShift = 16 - srcShift;
        }
        /* This is the outer loop */
        /* mask1 covers the half of the word that is NOT written for the first pixel of a row. */
        mask1 = shl(0xFFFF, 16 - srcShift);
        while (--deltaY > 0) {
            /* Source: 4 bytes per pixel. Destination: two pixels per 4-byte word. */
            long srcIndex = srcY * sourcePitch + sx * 4;
            long dstIndex = dstY * destPitch + dx / 2 * 4;
            /* Row of the 4x4 dither matrix used for this destination line. */
            final int ditherBase = (dstY & 3) * 4;
            /* For pre-increment */
            int ditherIndex = (sx & 3) - 1;
            /* So we can pre-decrement */
            int deltaX = bbW + 1;
            long dstMask = mask1;
            /* Re-derive the pixel shift for the start of the row from the mask. */
            if (dstMask == 0xFFFF) {
                srcShift = 16;
            } else {
                srcShift = 0;
            }
            while (--deltaX > 0) {
                /* Advance the dither column, wrapping around after four entries. */
                final int ditherThreshold = DITHER_MATRIX_4X4[ditherBase + (ditherIndex = ditherIndex + 1 & 3)];
                long sourceWord = srcLongAt(srcIndex);
                final long srcAlpha = sourceWord & 0xFFFFFF;
                if (srcAlpha != 0) {
                    /* 0 < srcAlpha */
                    /* If we have to mix colors then just copy a single word */
                    /* begin dstLongAt: */
                    long destWord = dstLongAt(dstIndex);
                    /* Isolate the destination pixel and move it down to the low 16 bits. */
                    destWord = destWord & ~dstMask;
                    /* Expand from 16 to 32 bit by adding zero bits */
                    destWord = shr(destWord, srcShift);
                    /* Mix colors */
                    /* Spread the three 5-bit fields to the top of three bytes; top byte set to 0xFF. */
                    destWord = (destWord & 0x7C00) << 9 | (destWord & 0x3E0) << 6 | (destWord & 0x1F) << 3 | 0xFF000000L;
                    /* And dither */
                    sourceWord = rgbComponentAlpha32with(sourceWord, destWord);
                    sourceWord = dither32To16threshold(sourceWord, ditherThreshold);
                    /* A result of 0 is stored as 1 (presumably because 0 denotes transparent - TODO confirm). */
                    if (sourceWord == 0) {
                        sourceWord = shl(1, srcShift);
                    } else {
                        sourceWord = shl(sourceWord, srcShift);
                    }
                    destLongAtputmask(dstIndex, dstMask, sourceWord);
                }
                srcIndex += 4;
                /* Move to the next destination word once the second pixel of a word was handled. */
                if (destMSB) {
                    if (srcShift == 0) {
                        dstIndex += 4;
                    }
                } else {
                    if (srcShift != 0) {
                        dstIndex += 4;
                    }
                }
                /* Toggle between 0 and 16 */
                srcShift = srcShift ^ 16;
                /* Flip the mask so that it protects the other half of the word. */
                dstMask = ~dstMask;
            }
            srcY++;
            dstY++;
        }
    }

    /*
     * This version assumes combinationRule = 41 sourcePixSize = destPixSize = 32 sourceForm ~=
     * destForm. Note: The inner loop has been optimized for dealing with the special case of aR =
     * aG = aB = 0
     */

    /* BitBltSimulation>>#rgbComponentAlpha32 */
    private void rgbComponentAlpha32() {
        /*
         * Walks the bbW x bbH rectangle with one 32-bit word per pixel on both sides and replaces
         * every destination word by rgbComponentAlpha32with(source, destination). Runs of source
         * words whose low 24 bits are all zero are skipped without touching the destination.
         */
        /* This particular method should be optimized in itself */
        /* Give the compiler a couple of hints */
        /*
         * The following should be declared as pointers so the compiler will notice that they're
         * used for accessing memory locations (good to know on an Intel architecture) but then the
         * increments would be different between ST code and C code so must hope the compiler
         * notices what happens (MS Visual C does)
         */

        /* So we can pre-decrement */
        int deltaY = bbH + 1;
        int srcY = sy;
        /* This is the outer loop */
        int dstY = dy;
        while (--deltaY > 0) {
            /* Byte offsets of the first pixel of this row; 4 bytes per pixel on both sides. */
            long srcIndex = srcY * sourcePitch + sx * 4;
            long dstIndex = dstY * destPitch + dx * 4;
            /* So we can pre-decrement */
            /* This is the inner loop */
            long deltaX = bbW + 1;
            while (--deltaX != 0) {
                long sourceWord = srcLongAt(srcIndex);
                final long srcAlpha = sourceWord & 0xFFFFFF;
                if (srcAlpha == 0) {
                    srcIndex += 4;
                    /* Now skip as many words as possible, */
                    dstIndex += 4;
                    while (--deltaX != 0 && ((sourceWord = srcLongAt(srcIndex)) & 0xFFFFFF) == 0) {
                        srcIndex += 4;
                        dstIndex += 4;
                    }
                    /* Undo the last decrement: the enclosing loop condition decrements again. */
                    deltaX++;
                } else {
                    /* 0 < srcAlpha */
                    /* If we have to mix colors then just copy a single word */
                    /* begin dstLongAt: */
                    final long destWord = rgbComponentAlpha32with(sourceWord, dstLongAt(dstIndex));
                    /* begin dstLongAt:put: */
                    dstLongAtput(dstIndex, destWord);
                    srcIndex += 4;
                    dstIndex += 4;
                }
            }
            srcY++;
            dstY++;
        }
    }

    /*
     * componentAlphaModeColor is the color, sourceWord contains an alpha value for each component
     * of RGB, each of which is encoded as 0 meaning 0.0 and 255 meaning 1.0. The rule is...
     *
     * color = componentAlphaModeColor. colorAlpha = componentAlphaModeAlpha. mask = sourceWord.
     * dst.A = colorAlpha + (1 - colorAlpha) * dst.A dst.R = color.R * mask.R * colorAlpha + (1 -
     * (mask.R * colorAlpha)) * dst.R dst.G = color.G * mask.G * colorAlpha + (1 - (mask.G*
     * colorAlpha)) * dst.G dst.B = color.B * mask.B * colorAlpha + (1 - (mask.B* colorAlpha)) *
     * dst.B
     */

    /* BitBltSimulation>>#rgbComponentAlpha32:with: */
    private long rgbComponentAlpha32with(final long sourceWord, final long destinationWord) {
        /*
         * Blends componentAlphaModeColor into destinationWord, using the four bytes of sourceWord
         * as separate coverage values for the A, R, G and B components. The coverage values are
         * first scaled by the low byte of componentAlphaModeAlpha unless that byte is 0xFF. A zero
         * sourceWord leaves the destination unchanged.
         */
        if (sourceWord == 0) {
            return destinationWord;
        }
        final long colorAlpha = componentAlphaModeAlpha & 0xFF;
        /* Per-component coverage, lowest byte first. */
        long coverageB = sourceWord & 0xFF;
        long coverageG = sourceWord >>> 8 & 0xFF;
        long coverageR = sourceWord >>> 16 & 0xFF;
        long coverageA = sourceWord >>> 24 & 0xFF;
        if (colorAlpha != 0xFF) {
            coverageA = coverageA * colorAlpha >>> 8;
            coverageR = coverageR * colorAlpha >>> 8;
            coverageG = coverageG * colorAlpha >>> 8;
            coverageB = coverageB * colorAlpha >>> 8;
        }
        final long color = componentAlphaModeColor;
        final long blue = blendComponentAlphaChannel(destinationWord & 0xFF, color & 0xFF, coverageB);
        final long green = blendComponentAlphaChannel(destinationWord >>> 8 & 0xFF, color >>> 8 & 0xFF, coverageG);
        final long red = blendComponentAlphaChannel(destinationWord >>> 16 & 0xFF, color >>> 16 & 0xFF, coverageR);
        /* The alpha component is blended without any gamma handling. */
        long alphaOut = ((destinationWord >>> 24 & 0xFF) * (0xFF - coverageA) >>> 8) + coverageA;
        if (alphaOut > 0xFF) {
            alphaOut = 0xFF;
        }
        /* Reassemble the word, highest component first. */
        return ((((((alphaOut << 8) + red) << 8) + green) << 8) + blue);
    }

    /*
     * Blends one colour component for rgbComponentAlpha32with: both inputs are passed through
     * ungammaLookupTable when it is set, mixed according to the coverage value, clamped to 0xFF and
     * passed through gammaLookupTable when it is set.
     */
    private long blendComponentAlphaChannel(final long destComponent, final long colorComponent, final long coverage) {
        long dest = destComponent;
        long src = colorComponent;
        if (ungammaLookupTable != null) {
            dest = ungammaLookupTable[(int) dest];
            src = ungammaLookupTable[(int) src];
        }
        long mixed = (dest * (0xFF - coverage) >>> 8) + (src * coverage >>> 8);
        if (mixed > 0xFF) {
            mixed = 0xFF;
        }
        if (gammaLookupTable != null) {
            mixed = gammaLookupTable[(int) mixed];
        }
        return mixed;
    }

    /*
     * This version assumes combinationRule = 41 sourcePixSize = 32 destPixSize = 8 sourceForm ~=
     * destForm. Note: This is not real blending since we don't have the source colors available.
     */

    /* BitBltSimulation>>#rgbComponentAlpha8 */
    private void rgbComponentAlpha8() {
        /*
         * Walks the bbW x bbH rectangle, reading one 32-bit source word per pixel and blending it
         * into the matching 8-bit slot of a destination word. The average of the three low source
         * bytes decides the treatment: up to 0x1F the pixel is skipped, above 224 the source is
         * replaced by all ones, anything in between is blended as is. The destination pixel is
         * expanded through DEFAULT_8_TO_32_TABLE, blended via rgbComponentAlpha32with and mapped
         * back with mapPixelflags.
         */
        /* This particular method should be optimized in itself */
        final long[] mappingTable = DEFAULT_8_TO_32_TABLE;
        final long mapperFlags = cmFlags & ~COLOR_MAP_NEW_STYLE;
        /* So we can pre-decrement */
        int deltaY = bbH + 1;
        int srcY = sy;
        int dstY = dy;
        /* mask1: bit offset (0, 8, 16 or 24) of the first destination pixel inside its word. */
        mask1 = (dx & 3) * 8;
        if (destMSB) {
            mask1 = 24 - mask1;
        }
        /* mask2: all bits of the word except the 8 bits of that first pixel. */
        mask2 = ALL_ONES ^ shl(0xFF, mask1);
        /*
         * adjust is either 0 or 522133279 (0x1F1F1F1F) and is toggled for every pixel and every
         * row; when non-zero it forces the low five bits of each source byte to ones.
         */
        long adjust;
        if ((dx & 1) == 0) {
            adjust = 0;
        } else {
            adjust = 522133279;
        }
        if ((dy & 1) == 0) {
            adjust = adjust ^ 522133279;
        }
        while (--deltaY != 0) {
            adjust = adjust ^ 522133279;
            /* Source: 4 bytes per pixel. Destination: four pixels per 4-byte word. */
            long srcIndex = srcY * sourcePitch + sx * 4;
            long dstIndex = dstY * destPitch + dx / 4 * 4;
            /* So we can pre-decrement */
            int deltaX = bbW + 1;
            long srcShift = mask1;
            /* This is the inner loop */
            long dstMask = mask2;
            while (--deltaX != 0) {
                long sourceWord = (srcLongAt(srcIndex) & ~adjust) + adjust;
                /* set srcAlpha to the average of the 3 separate aR,Ag,AB values */
                long srcAlpha = sourceWord & 0xFFFFFF;
                srcAlpha = div((srcAlpha >>> 16) + (srcAlpha >>> 8 & 0xFF) + (srcAlpha & 0xFF), 3);
                if (srcAlpha > 0x1F) {
                    /* Everything below 31 is transparent */
                    if (srcAlpha > 224) {
                        /* treat everything above 224 as opaque */
                        sourceWord = 0xFFFFFFFFL;
                    }
                    /* begin dstLongAt: */
                    long destWord = dstLongAt(dstIndex);
                    /* Isolate the destination pixel and move it down to the low 8 bits. */
                    destWord = destWord & ~dstMask;
                    destWord = shr(destWord, srcShift);
                    /* Expand the 8-bit pixel to a 32-bit value for blending. */
                    destWord = mappingTable[(int) destWord];
                    sourceWord = rgbComponentAlpha32with(sourceWord, destWord);
                    sourceWord = mapPixelflags(sourceWord, mapperFlags);
                    /* Store back */
                    sourceWord = shl(sourceWord, srcShift);
                    destLongAtputmask(dstIndex, dstMask, sourceWord);
                }
                srcIndex += 4;
                /* Step to the next 8-bit slot, moving on to the next word after the last slot. */
                if (destMSB) {
                    if (srcShift == 0) {
                        dstIndex += 4;
                        srcShift = 24;
                        dstMask = 0xFFFFFF;
                    } else {
                        srcShift -= 8;
                        dstMask = dstMask >>> 8 | 0xFF000000L;
                    }
                } else {
                    /*
                     * NOTE(review): the last slot of a word sits at shift 24, yet the wrap test
                     * compares against 32, so one extra iteration runs with srcShift == 32 before
                     * the word is advanced. Looks unintended - confirm against the generator source.
                     */
                    if (srcShift == 32) {
                        dstIndex += 4;
                        srcShift = 0;
                        dstMask = 0xFFFFFF00L;
                    } else {
                        srcShift += 8;
                        dstMask = dstMask << 8 | 0xFF;
                    }
                }
                adjust = adjust ^ 522133279;
            }
            srcY++;
            dstY++;
        }
    }

    /*
     * componentAlphaModeColor is the color, sourceWord contains an alpha value for each component
     * of RGB, each of which is encoded as 0 meaning 0.0 and 255 meaning 1.0. The rule is...
     *
     * color = componentAlphaModeColor. colorAlpha = componentAlphaModeAlpha. mask = sourceWord.
     * dst.A = colorAlpha + (1 - colorAlpha) * dst.A dst.R = color.R * mask.R * colorAlpha + (1 -
     * (mask.R * colorAlpha)) * dst.R dst.G = color.G * mask.G * colorAlpha + (1 - (mask.G*
     * colorAlpha)) * dst.G dst.B = color.B * mask.B * colorAlpha + (1 - (mask.B* colorAlpha)) *
     * dst.B
     */

    /* BitBltSimulation>>#rgbComponentAlpha:with: */
    private long rgbComponentAlphawith(final long sourceWord, final long destinationWord) {
        /*
         * A zero source word leaves the destination unchanged; anything else is handed to the
         * partitioned component-alpha blend using the destination depth and pixels per word.
         */
        return sourceWord == 0 ? destinationWord : partitionedRgbComponentAlphadestnBitsnPartitions(sourceWord, destinationWord, destDepth, destPPW);
    }

    /*
     * Subtract the pixels in the source and destination, color by color, and return the sum of the
     * absolute value of all the differences. For non-rgb, return the number of differing pixels.
     */

    /* BitBltSimulation>>#rgbDiff:with: */
    private long rgbDiffwith(final long sourceWord, final long destinationWord) {
        /*
         * Adds to bitCount a measure of how much the source pixels differ from the destination
         * pixels: 1 per differing pixel below 16 bits depth, otherwise the sum of the three
         * per-component differences. Only pixels selected by destMask are tallied. Always answers
         * destinationWord unchanged.
         */
        final long pixelMask = MASK_TABLE[destDepth];
        final boolean sixteenBit = destDepth == 16;
        final int componentBits = sixteenBit ? 5 : 8;
        final long componentMask = sixteenBit ? 0x1F : 0xFF;
        /* All three words are consumed from the low end, one pixel per iteration. */
        long remainingMask = destMask;
        long remainingDest = destinationWord;
        long remainingSource = sourceWord;
        for (int pixel = 0; pixel < destPPW; pixel++) {
            if ((remainingMask & pixelMask) > 0) {
                /* Only tally pixels within the destination rectangle */
                final long destPixel = remainingDest & pixelMask;
                final long sourcePixel = remainingSource & pixelMask;
                final long difference;
                if (destDepth >= 16) {
                    final long perComponent = partitionedSubfromnBitsnPartitions(sourcePixel, destPixel, componentBits, 3);
                    difference = (perComponent & componentMask) + (perComponent >>> componentBits & componentMask) + (perComponent >>> componentBits >>> componentBits & componentMask);
                } else {
                    difference = sourcePixel == destPixel ? 0 : 1;
                }
                bitCount += difference;
            }
            remainingMask = remainingMask >>> destDepth;
            remainingSource = remainingSource >>> destDepth;
            remainingDest = remainingDest >>> destDepth;
        }
        return destinationWord;
    }

    /*
     * Convert the given 16bit pixel value to a 32bit RGBA value. Note: This method is intended to
     * deal with different source formats.
     */

    /* BitBltSimulation>>#rgbMap16To32: */
    private static long rgbMap16To32(final long sourcePixel) {
        /*
         * Spreads the three 5-bit fields of the pixel (bits 0-4, 5-9 and 10-14) to the top five
         * bits of the three low bytes of the result; every other bit of the result is zero.
         */
        final long lowField = (sourcePixel & 0x1F) << 3;
        final long middleField = (sourcePixel & 0x3E0) << 6;
        final long highField = (sourcePixel & 0x7C00) << 9;
        return lowField | middleField | highField;
    }

    /*
     * Convert the given 32bit pixel value to a 32bit RGBA value. Note: This method is intended to
     * deal with different source formats.
     */

    /* BitBltSimulation>>#rgbMap32To32: */
    private static long rgbMap32To32(final long sourcePixel) {
        /* Identity mapping: the pixel is answered exactly as it was passed in. */
        return sourcePixel;
    }

    /* Perform the RGBA conversion for the given source pixel */

    /* BitBltSimulation>>#rgbMapPixel:flags: */
    private long rgbMapPixelflags(final long sourcePixel) {
        /*
         * Masks the pixel with each of the four entries of cmMaskTable, shifts each masked part by
         * the matching entry of cmShiftTable and answers the four parts OR-ed together.
         */
        final long part0 = shift(sourcePixel & cmMaskTable[0], cmShiftTable[0]);
        final long part1 = shift(sourcePixel & cmMaskTable[1], cmShiftTable[1]);
        final long part2 = shift(sourcePixel & cmMaskTable[2], cmShiftTable[2]);
        final long part3 = shift(sourcePixel & cmMaskTable[3], cmShiftTable[3]);
        return part0 | part1 | part2 | part3;
    }

    /*
     * Convert the given pixel value with nBitsIn bits for each color component to a pixel value
     * with nBitsOut bits for each color component. Typical values for nBitsIn/nBitsOut are 3, 5, or
     * 8.
     */

    /* BitBltSimulation>>#rgbMap:from:to: */
    private static long rgbMapfromto(final long sourcePixel, final long nBitsIn, final long nBitsOut) {
        /*
         * Re-packs a pixel made of three components of nBitsIn bits each into three components of
         * nBitsOut bits each. Widening shifts every component up and zero-fills; narrowing drops
         * the low bits of every component. A non-zero pixel never narrows to 0 (1 is answered
         * instead).
         */
        final long widening = nBitsOut - nBitsIn;
        if (widening == 0) {
            /* Same component size: only strip bits above the three components for 5 and 8 bits. */
            if (nBitsIn == 5) {
                /* May be called with 16 bits although the pixel has 15; never answer more. */
                return sourcePixel & 0x7FFF;
            }
            if (nBitsIn == 8) {
                /* May be called with 32 bits although the pixel has 24; never answer more. */
                return sourcePixel & 0xFFFFFF;
            }
            return sourcePixel;
        }
        if (widening > 0) {
            /* Expand to more bits by zero-fill */
            final long componentMask = shl(1, nBitsIn) - 1;
            final long shiftedOnce = shl(sourcePixel, widening);
            final long lowMask = shl(componentMask, widening);
            final long middleMask = shl(lowMask, nBitsOut);
            final long shiftedTwice = shl(shiftedOnce, widening);
            return (shiftedOnce & lowMask) + (shiftedTwice & middleMask) + (shl(shiftedTwice, widening) & shl(middleMask, nBitsOut));
        }
        /* Compress to fewer bits by truncation */
        if (sourcePixel == 0) {
            return sourcePixel;
        }
        final long narrowing = nBitsIn - nBitsOut;
        final long lowMask = shl(1, nBitsOut) - 1;
        final long shiftedOnce = shr(sourcePixel, narrowing);
        final long middleMask = shl(lowMask, nBitsOut);
        final long shiftedTwice = shr(shiftedOnce, narrowing);
        final long packed = (shiftedOnce & lowMask) + (shiftedTwice & middleMask) + (shr(shiftedTwice, narrowing) & shl(middleMask, nBitsOut));
        return packed == 0 ? 1L : packed;
    }

    /* BitBltSimulation>>#rgbMax:with: */
    private long rgbMaxwith(final long sourceWord, final long destinationWord) {
        /*
         * Partitioned maximum of source and destination. The partitioning depends on the
         * destination depth: whole pixels below 16 bits, three 5-bit components per 16-bit pixel
         * at 16 bits, and four 8-bit components otherwise.
         */
        if (destDepth > 16) {
            return partitionedMaxwithnBitsnPartitions(sourceWord, destinationWord, 8, 4);
        } else if (destDepth == 16) {
            /* Handle the low and the high 16-bit pixel separately and recombine them. */
            return partitionedMaxwithnBitsnPartitions(sourceWord, destinationWord, 5, 3) +
                            (partitionedMaxwithnBitsnPartitions(sourceWord >>> 16, destinationWord >>> 16, 5, 3) << 16);
        } else {
            return partitionedMaxwithnBitsnPartitions(sourceWord, destinationWord, destDepth, destPPW);
        }
    }

    /* BitBltSimulation>>#rgbMinInvert:with: */
    private long rgbMinInvertwith(final long wordToInvert, final long destinationWord) {
        /*
         * Same as rgbMinwith, except that the first operand is bitwise inverted before the
         * partitioned minimum is taken.
         */
        return rgbMinwith(~wordToInvert, destinationWord);
    }

    /* BitBltSimulation>>#rgbMin:with: */
    private long rgbMinwith(final long sourceWord, final long destinationWord) {
        /*
         * Component-wise minimum of source and destination. How the word is partitioned depends on
         * the destination depth.
         */
        if (destDepth == 16) {
            /* Min the three 5-bit components of the low and of the high half of the word */
            return partitionedMinwithnBitsnPartitions(sourceWord, destinationWord, 5, 3) +
                            (partitionedMinwithnBitsnPartitions(sourceWord >>> 16, destinationWord >>> 16, 5, 3) << 16);
        }
        if (destDepth > 16) {
            /* Min the four 8-bit (RGBA) components of the pixel */
            return partitionedMinwithnBitsnPartitions(sourceWord, destinationWord, 8, 4);
        }
        /* Below 16 bits: min each of the destPPW pixels of the word separately */
        return partitionedMinwithnBitsnPartitions(sourceWord, destinationWord, destDepth, destPPW);
    }

    /* BitBltSimulation>>#rgbMul:with: */
    private long rgbMulwith(final long sourceWord, final long destinationWord) {
        /*
         * Component-wise multiplication of source and destination. How the word is partitioned
         * depends on the destination depth.
         */
        if (destDepth == 16) {
            /* Mul the three 5-bit components of the low and of the high half of the word */
            return partitionedMulwithnBitsnPartitions(sourceWord, destinationWord, 5, 3) +
                            (partitionedMulwithnBitsnPartitions(sourceWord >>> 16, destinationWord >>> 16, 5, 3) << 16);
        }
        if (destDepth > 16) {
            /* Mul the four 8-bit (RGBA) components of the pixel */
            return partitionedMulwithnBitsnPartitions(sourceWord, destinationWord, 8, 4);
        }
        /* Below 16 bits: mul each of the destPPW pixels of the word separately */
        return partitionedMulwithnBitsnPartitions(sourceWord, destinationWord, destDepth, destPPW);
    }

    /* BitBltSimulation>>#rgbSub:with: */
    private long rgbSubwith(final long sourceWord, final long destinationWord) {
        /*
         * Component-wise subtraction via partitionedSubfromnBitsnPartitions. How the word is
         * partitioned depends on the destination depth.
         */
        if (destDepth == 16) {
            /* Sub the three 5-bit components of the low and of the high half of the word */
            return partitionedSubfromnBitsnPartitions(sourceWord, destinationWord, 5, 3) +
                            (partitionedSubfromnBitsnPartitions(sourceWord >>> 16, destinationWord >>> 16, 5, 3) << 16);
        }
        if (destDepth > 16) {
            /* Sub the four 8-bit (RGBA) components of the pixel */
            return partitionedSubfromnBitsnPartitions(sourceWord, destinationWord, 8, 4);
        }
        /* Below 16 bits: sub each of the destPPW pixels of the word separately */
        return partitionedSubfromnBitsnPartitions(sourceWord, destinationWord, destDepth, destPPW);
    }

    /* WARNING: For WarpBlt w/ smoothing the source depth is wrong here! */

    /* BitBltSimulation>>#setupColorMasks */
    private void setupColorMasks() {
        /* Sources of 8 bits or less need no RGB component masks */
        if (sourceDepth <= 8) {
            return;
        }
        /* Bits per color component of the source: 5 for 16-bit, 8 for 32-bit, otherwise 0 */
        final long sourceBits = sourceDepth == 16 ? 5 : sourceDepth == 32 ? 8 : 0;
        final long targetBits;
        if (cmBitsPerColor != 0) {
            /* The color map dictates the component width */
            targetBits = cmBitsPerColor;
        } else {
            /* No color map component width: convert to destDepth */
            if (destDepth <= 8) {
                return;
            }
            targetBits = destDepth == 16 ? 5 : destDepth == 32 ? 8 : 0;
        }
        setupColorMasksFromto(sourceBits, targetBits);
    }

    /*
     * Setup color masks for converting an incoming RGB pixel value from srcBits to targetBits.
     */

    /* BitBltSimulation>>#setupColorMasksFrom:to: */
    private void setupColorMasksFromto(final long srcBits, final long targetBits) {
        final int deltaBits = (int) (targetBits - srcBits);
        if (deltaBits == 0) {
            /* Same component width on both sides: nothing to set up */
            return;
        }
        final int[] masks = cmMaskTableTemplate;
        final int[] shifts = cmShiftTableTemplate;
        /*
         * Mask for extracting a color part of the source. When narrowing (deltaBits < 0) the mask is
         * targetBits wide and selects the top bits of each source component, i.e. it is moved up by
         * -deltaBits. When widening it is srcBits wide and sits at the bottom of each component.
         */
        final long componentMask;
        final long offsetInComponent;
        if (deltaBits < 0) {
            componentMask = shl(1, targetBits) - 1;
            offsetInComponent = -(long) deltaBits;
        } else {
            componentMask = shl(1, srcBits) - 1;
            offsetInComponent = 0;
        }
        masks[RED_INDEX] = (int) shl(componentMask, srcBits * 2 + offsetInComponent);
        masks[GREEN_INDEX] = (int) shl(componentMask, srcBits + offsetInComponent);
        masks[BLUE_INDEX] = (int) shl(componentMask, offsetInComponent);
        // masks[ALPHA_INDEX] stays untouched (always zero anyway).
        shifts[RED_INDEX] = deltaBits * 3;
        shifts[GREEN_INDEX] = deltaBits * 2;
        shifts[BLUE_INDEX] = deltaBits;
        // shifts[ALPHA_INDEX] stays untouched (always zero anyway).
        cmShiftTable = shifts;
        cmMaskTable = masks;
        cmFlags = cmFlags | COLOR_MAP_PRESENT | COLOR_MAP_FIXED_PART;
    }

    /* BitBltSimulation>>#showDisplayBits */
    private void showDisplayBits() {
        /* Nothing to refresh when the image runs without a display */
        if (!image.hasDisplay()) {
            return;
        }
        /* Hand the affected rectangle of the destination form to the display */
        image.getDisplay().showDisplayBitsLeftTopRightBottom(destForm, affectedL, affectedT, affectedR, affectedB);
    }

    /*
     * This is only used when source and dest are same depth, ie, when the barrel-shift copy loop is
     * used.
     */

    /* BitBltSimulation>>#sourceSkewAndPointerInit (modified, copied from SqueakJS) */
    private void sourceSkewAndPointerInit() {
        assert destPPW == sourcePPW && destMSB == sourceMSB && destDepth == sourceDepth;
        /* A mask for the pixel position within a word, assuming destPPW is a power of two */
        final int pixelInWordMask = destPPW - 1;
        final int sxLowBits = sx & pixelInWordMask;
        final int dxLowBits = dx & pixelInWordMask;
        final boolean leftToRight = hDir > 0;
        /* how many pixels in first word */
        final int startBits1;
        if (leftToRight) {
            startBits1 = sourcePPW - sxLowBits;
        } else {
            startBits1 = sxLowBits + 1;
        }
        /* Mask covering the bits of those first pixels */
        final int remainingBits = 32 - startBits1 * destDepth;
        final long m1;
        if (destMSB == leftToRight) {
            m1 = ALL_ONES >> remainingBits;
        } else {
            m1 = (ALL_ONES << remainingBits) & ALL_ONES;
        }
        /* preload if m1 does not cover all of mask1, i.e. there are some missing bits */
        preload = bbW > startBits1 && (m1 & mask1) != mask1;
        /* calculate right-shift skew from source to dest (-32..32) */
        final int lowBitsDelta = sourceMSB ? sxLowBits - dxLowBits : dxLowBits - sxLowBits;
        skew = destDepth * lowBitsDelta;
        /* calculate increments from end of 1 line to start of next */
        sourceIndex = (sy * sourcePitch) + ((sx / sourcePPW) * 4);
        sourceDelta = (sourcePitch * vDir) - (4 * (nWords * hDir));
        if (preload) {
            if (skew < 0) {
                skew = skew + 32;
            } else {
                skew = skew - 32;
            }
            /* Compensate for extra source word fetched */
            sourceDelta -= 4 * hDir;
        }
        assert !(preload && skew == 0);
        assert -32 <= skew && skew <= 32; // Modified (image uses 31 instead of 32).
    }

    /* BitBltSimulation>>#sourceWord:with: */
    /* Combination function that ignores the destination and yields the source word unchanged. */
    private static long sourceWordwith(final long sourceWord, @SuppressWarnings("unused") final long destinationWord) {
        return sourceWord;
    }

    /* BitBltSimulation>>#subWord:with: */
    /* Combination function: plain long subtraction of the destination word from the source word. */
    private static long subWordwith(final long sourceWord, final long destinationWord) {
        /* No masking is applied here, so the result may be negative or exceed 32 bits */
        return sourceWord - destinationWord;
    }

    /*
     * Tally pixels into the color map. Those tallied are exactly those in the destination
     * rectangle. Note that the source should be specified == destination, in order for the proper
     * color map checks to be performed at setup.
     */

    /* BitBltSimulation>>#tallyIntoMap:with: */
    private long tallyIntoMapwith(@SuppressWarnings("unused") final long sourceWord, final long destinationWord) {
        /* Tallying needs a color map that is present and has an indexed part */
        final long requiredFlags = COLOR_MAP_PRESENT | COLOR_MAP_INDEXED_PART;
        if ((cmFlags & requiredFlags) != requiredFlags) {
            return destinationWord;
        }
        final long pixMask = MASK_TABLE[destDepth];
        long remainingMask = destMask;
        long remainingPixels = destinationWord;
        /* Walk the destPPW pixels of the word from the low bits upwards */
        for (int pixel = 0; pixel < destPPW; pixel++, remainingMask >>>= destDepth, remainingPixels >>>= destDepth) {
            if ((remainingMask & pixMask) == 0) {
                /* Only tally pixels within the destination rectangle */
                continue;
            }
            final long pixVal = remainingPixels & pixMask;
            final long mapIndex;
            if (destDepth < 16) {
                /* Indexed pixel values are used as map index directly */
                mapIndex = pixVal;
            } else {
                /* RGB pixel values are first reduced to cmBitsPerColor bits per component */
                mapIndex = rgbMapfromto(pixVal, destDepth == 16 ? 5 : 8, cmBitsPerColor);
            }
            tallyMapAtput(mapIndex);
        }
        /* The destination itself is never modified by tallying */
        return destinationWord;
    }

    /* Increments the counter stored in the color map lookup table for the given map index. */
    private void tallyMapAtput(final long mapIndex) {
        /* The index is masked with cmMask before it is used as array index */
        cmLookupTable[(int) (mapIndex & cmMask)] += 1;
    }

    /*
     * Shortcut for stuff that's being run from the balloon engine. Since we do this at each scan
     * line we should avoid the expensive setup for source and destination.
     */
    /* We need a source. */

    /* BitBltSimulation>>#tryCopyingBitsQuickly */
    private boolean tryCopyingBitsQuickly() {
        /*
         * The fast path needs a source, handles only rules 34 and 41, only 32-bit sources, and no
         * blits where source and destination are the same form.
         */
        final boolean supportedRule = combinationRule == 34 || combinationRule == 41;
        if (noSource || !supportedRule || sourceDepth != 32 || sourceForm == destForm) {
            return false;
        }
        if (combinationRule == 41) {
            /* Rule 41 is only available for 32, 16 and 8 bit destinations */
            if (destDepth == 32) {
                rgbComponentAlpha32();
            } else if (destDepth == 16) {
                rgbComponentAlpha16();
            } else if (destDepth == 8) {
                rgbComponentAlpha8();
            } else {
                return false;
            }
        } else {
            /* Rule 34 needs at least 8 bit deep destinations */
            if (destDepth < 8) {
                return false;
            }
            /* An 8 bit destination additionally requires a color map */
            if (destDepth == 8 && (cmFlags & COLOR_MAP_PRESENT) == 0) {
                return false;
            }
            if (destDepth == 32) {
                alphaSourceBlendBits32();
            }
            if (destDepth == 16) {
                alphaSourceBlendBits16();
            }
            if (destDepth == 8) {
                alphaSourceBlendBits8();
            }
        }
        /* Report the full destination rectangle as affected */
        affectedL = dx;
        affectedR = dx + bbW;
        affectedT = dy;
        affectedB = dy + bbH;
        return true;
    }

    /* BitBltSimulation>>#unlockSurfaces */
    /* Counterpart of lockSurfaces; intentionally empty and kept so call sites mirror BitBltSimulation. */
    private void unlockSurfaces() {
        // Actual unlocking code not needed for TruffleSqueak.
    }

    /* BitBltSimulation>>#warpBits */
    private void warpBits(final long smoothingCount, final AbstractSqueakObject sourceMap) {
        /* Clip with noSource temporarily forced to true, then restore the previous value */
        final boolean savedNoSource = noSource;
        noSource = true;
        clipRange();
        noSource = savedNoSource;
        if (savedNoSource || bbW <= 0 || bbH <= 0) {
            /* no source, or zero width or height; noop */
            affectedL = 0;
            affectedR = 0;
            affectedT = 0;
            affectedB = 0;
            return;
        }
        if (!lockSurfaces()) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        destMaskAndPointerInit();
        warpLoop(smoothingCount, sourceMap);
        /* Record the affected rectangle; its anchor depends on the blit direction */
        final boolean leftToRight = hDir > 0;
        affectedL = leftToRight ? dx : dx - bbW + 1;
        affectedR = leftToRight ? dx + bbW : dx + 1;
        final boolean topToBottom = vDir > 0;
        affectedT = topToBottom ? dy : dy - bbH + 1;
        affectedB = topToBottom ? dy + bbH : dy + 1;
        unlockSurfaces();
    }

    /*
     * This version of the inner loop traverses an arbitrary quadrilateral source, thus producing a
     * general affine transformation.
     */

    /* BitBltSimulation>>#warpLoop */
    private void warpLoop(final long smoothingCountValue, final AbstractSqueakObject sourceMapOopValue) {
        /*
         * Overview: (1) read the eight warp coordinates from bitBltOop and derive the per-row deltas
         * of the two edge points A and B, (2) validate the optional smoothing source map, (3) for
         * every destination row interpolate from A to B and pick/merge the source pixels word by
         * word. A null sourceMapOopValue means "no smoothing arguments given".
         */
        long halftoneWord = 0;
        /* Merge function for the current combination rule */
        final LongBinaryOperator mergeFnwith = opTable[combinationRule + 1];
        /* The warp coordinates occupy the slots starting at BB_WARP_BASE; fail if they are missing */
        if (slotSizeOf(bitBltOop) < BB_WARP_BASE + 12) {
            PrimitiveFailed.andTransferToInterpreter();
        }
        /* Number of vertical interpolation steps (at least 1) */
        int nSteps = height - 1;
        if (nSteps <= 0) {
            nSteps = 1;
        }
        /*
         * For each coordinate: fetch start and end value, compute the per-step delta, and for a
         * negative delta re-derive the start value from the end value (words is recycled as temp).
         */
        int pAx = fetchIntOrFloatofObject(BB_WARP_BASE, bitBltOop);
        int words = fetchIntOrFloatofObject(BB_WARP_BASE + 3, bitBltOop);
        final int deltaP12x = deltaFromtonSteps(pAx, words, nSteps);
        if (deltaP12x < 0) {
            pAx = words - nSteps * deltaP12x;
        }
        int pAy = fetchIntOrFloatofObject(BB_WARP_BASE + 1, bitBltOop);
        words = fetchIntOrFloatofObject(BB_WARP_BASE + 4, bitBltOop);
        final int deltaP12y = deltaFromtonSteps(pAy, words, nSteps);
        if (deltaP12y < 0) {
            pAy = words - nSteps * deltaP12y;
        }
        int pBx = fetchIntOrFloatofObject(BB_WARP_BASE + 9, bitBltOop);
        words = fetchIntOrFloatofObject(BB_WARP_BASE + 6, bitBltOop);
        final int deltaP43x = deltaFromtonSteps(pBx, words, nSteps);
        if (deltaP43x < 0) {
            pBx = words - nSteps * deltaP43x;
        }
        int pBy = fetchIntOrFloatofObject(BB_WARP_BASE + 10, bitBltOop);
        words = fetchIntOrFloatofObject(BB_WARP_BASE + 7, bitBltOop);
        final int deltaP43y = deltaFromtonSteps(pBy, words, nSteps);
        if (deltaP43y < 0) {
            pBy = words - nSteps * deltaP43y;
        }
        /* Bail out if the success flag was cleared while fetching the coordinates */
        if (failed()) {
            return;
        }
        final long smoothingCount;
        /* Either null, an int[] (sourceMapIsWords) or a byte[] */
        final Object sourceMap;
        final boolean sourceMapIsWords;
        if (sourceMapOopValue != null) {
            smoothingCount = smoothingCountValue;
            if (sourceMapOopValue == NilObject.SINGLETON) {
                if (sourceDepth < 16) {
                    /* a source map is required to smooth a non-RGB (less than 16 bit) source */
                    PrimitiveFailed.andTransferToInterpreter();
                }
                sourceMap = null;
                sourceMapIsWords = false;
            } else {
                final NativeObject sourceMapNative = (NativeObject) sourceMapOopValue;
                final int sourceMapSize;
                if (sourceMapNative.isIntType()) {
                    sourceMapIsWords = true;
                    final int[] ints = sourceMapNative.getIntStorage();
                    sourceMap = ints;
                    sourceMapSize = ints.length * Integer.BYTES;
                } else {
                    sourceMapIsWords = false;
                    final byte[] bytes = sourceMapNative.getByteStorage();
                    sourceMap = bytes;
                    sourceMapSize = bytes.length;
                }
                /*
                 * NOTE(review): sourceMapSize is a byte count here, while the map is later read with
                 * UnsafeUtils.getInt(..., pixelValue). Confirm that comparing bytes against
                 * 2^sourceDepth entries is the intended bound.
                 */
                if (sourceMapSize < shl(1, sourceDepth)) {
                    /* sourceMap must be long enough for sourceDepth */
                    PrimitiveFailed.andTransferToInterpreter();
                }
            }
        } else {
            /* No smoothing requested */
            smoothingCount = 1;
            sourceMap = null;
            sourceMapIsWords = false;
        }
        /* From here on nSteps is the number of horizontal interpolation steps (at least 1) */
        nSteps = width - 1;
        if (nSteps <= 0) {
            nSteps = 1;
        }
        /* Pixels in the first destination word of a row, capped at bbW */
        int startBits = destPPW - (dx & destPPW - 1);
        /* Pixels in the last destination word of a row */
        final int endBits = (dx + bbW - 1 & destPPW - 1) + 1;
        if (bbW < startBits) {
            startBits = bbW;
        }
        if (destY < clipY) {
            /* Advance increments if there was clipping in y */
            pAx += (clipY - destY) * deltaP12x;
            pAy += (clipY - destY) * deltaP12y;
            pBx += (clipY - destY) * deltaP43x;
            pBy += (clipY - destY) * deltaP43y;
        }
        warpLoopSetup();
        /* When smoothing with an old-style color map, set up masks to reduce the 8-bit averaged components */
        if (smoothingCount > 1 && (cmFlags & COLOR_MAP_NEW_STYLE) == 0) {
            if (cmLookupTable == null) {
                if (destDepth == 16) {
                    setupColorMasksFromto(8, 5);
                }
            } else {
                setupColorMasksFromto(8, cmBitsPerColor);
            }
        }
        /* Flags passed to the non-smoothing pixel picker, with COLOR_MAP_NEW_STYLE cleared */
        final long mapperFlags = cmFlags & ~COLOR_MAP_NEW_STYLE;
        /* Per-pixel change of dstBitShift and its value at the start of a full word */
        final int dstShiftInc;
        final int dstShiftLeft;
        if (destMSB) {
            dstShiftInc = 0 - destDepth;
            dstShiftLeft = 32 - destDepth;
        } else {
            dstShiftInc = destDepth;
            dstShiftLeft = 0;
        }
        if (noHalftone) {
            halftoneWord = ALL_ONES;
        }
        for (int i = 1; i <= bbH; i++) {
            /* here is the vertical loop... */
            /* Horizontal deltas for this row, interpolating from point A to point B */
            final int xDelta = deltaFromtonSteps(pAx, pBx, nSteps);
            if (xDelta >= 0) {
                sx = pAx;
            } else {
                sx = pBx - nSteps * xDelta;
            }
            final int yDelta = deltaFromtonSteps(pAy, pBy, nSteps);
            if (yDelta >= 0) {
                sy = pAy;
            } else {
                sy = pBy - nSteps * yDelta;
            }
            /* Bit position of the first destination pixel within its word */
            if (destMSB) {
                dstBitShift = 32 - ((dx & destPPW - 1) + 1) * destDepth;
            } else {
                dstBitShift = (dx & destPPW - 1) * destDepth;
            }
            if (destX < clipX) {
                /* Advance increments if there was clipping in x */
                sx += (clipX - destX) * xDelta;
                sy += (clipX - destX) * yDelta;
            }
            if (!noHalftone) {
                halftoneWord = halftoneLongAt(dy + i - 1);
            }
            destMask = mask1;
            /* Here is the inner loop... */
            int nPix = startBits;
            words = nWords;
            do {
                /* Picked (and mapped) source pixels, already aligned for the destination word */
                final long skewWord;
                if (smoothingCount == 1) {
                    /* Faster if not smoothing */
                    skewWord = warpPickSourcePixelsxDeltahyDeltahxDeltavyDeltavdstShiftIncflags(nPix, xDelta, yDelta, dstShiftInc, mapperFlags);
                } else {
                    /* more difficult with smoothing */
                    skewWord = warpPickSmoothPixelsxDeltahyDeltahxDeltavyDeltavsourceMapsmoothingdstShiftInc(nPix, xDelta, yDelta, deltaP12x, deltaP12y, sourceMap, sourceMapIsWords, smoothingCount,
                                    dstShiftInc);
                }
                /* All following words of the row start at the word boundary */
                dstBitShift = dstShiftLeft;
                if (destMask == ALL_ONES) {
                    /* avoid read-modify-write */
                    final long mergeWord = mergeFnwith.applyAsLong(skewWord & halftoneWord, dstLongAt(destIndex));
                    /* begin dstLongAt:put: */
                    dstLongAtput(destIndex, destMask & mergeWord);
                } else {
                    /* General version using dest masking */
                    /* begin dstLongAt: */
                    long destWord = dstLongAt(destIndex);
                    final long mergeWord = mergeFnwith.applyAsLong(skewWord & halftoneWord, destWord & destMask);
                    /* Keep the destination bits outside of destMask */
                    destWord = destMask & mergeWord | destWord & ~destMask;
                    /* begin dstLongAt:put: */
                    dstLongAtput(destIndex, destWord);
                }
                /* begin incDestIndex: */
                destIndex += 4;
                if (words == 2) {
                    /* e.g., is the next word the last word? */
                    /* set mask for last word in this row */
                    destMask = mask2;
                    nPix = endBits;
                } else {
                    /* use fullword mask for inner loop */
                    destMask = ALL_ONES;
                    nPix = destPPW;
                }
            } while (--words > 0);
            /* Move both edge points to the next row */
            pAx += deltaP12x;
            pAy += deltaP12y;
            pBx += deltaP43x;
            pBy += deltaP43y;
            /* begin incDestIndex: */
            destIndex += destDelta;
        }
    }

    /* Setup values for faster pixel fetching. */

    /* BitBltSimulation>>#warpLoopSetup */
    private void warpLoopSetup() {
        /* warpSrcShift = log2(sourceDepth), counted by halving until 1 is reached */
        warpSrcShift = 0;
        for (long remaining = sourceDepth; remaining != 1; remaining >>>= 1) {
            warpSrcShift++;
        }
        /* warpSrcMask: mask for a single source pixel */
        warpSrcMask = MASK_TABLE[sourceDepth];
        /* warpAlignShift: Shift for aligning x position to word boundary */
        warpAlignShift = 5 - warpSrcShift;
        /* warpAlignMask: Mask for extracting the pixel position from an x position */
        warpAlignMask = shl(1, warpAlignShift) - 1;
        /* warpBitShiftTable: given a sub-word x value, what's the bit shift? */
        for (int pixel = 0; pixel <= warpAlignMask; pixel++) {
            final long bitShift = sourceMSB ? 32 - shl(pixel + 1, warpSrcShift) : shl(pixel, warpSrcShift);
            warpBitShiftTable[pixel] = (int) bitShift;
        }
    }

    /*
     * Pick n (sub-) pixels from the source form, mapped by sourceMap, average the RGB values, map
     * by colorMap and return the new word. This version is only called from WarpBlt with
     * smoothingCount > 1
     */

    /*
     * BitBltSimulation>>#warpPickSmoothPixels:xDeltah:yDeltah:xDeltav:yDeltav:sourceMap:smoothing:
     * dstShiftInc:
     */
    private long warpPickSmoothPixelsxDeltahyDeltahxDeltavyDeltavsourceMapsmoothingdstShiftInc(final int nPixels, final long xDeltah, final long yDeltah, final long xDeltav,
                    final long yDeltav, final Object sourceMap, final boolean sourceMapIsWords, final long n, final long dstShiftInc) {
        /*
         * Produces nPixels destination pixels. For each one, an n x n grid of source sub-pixels is
         * sampled starting at (sx, sy), averaged per channel, mapped via mapPixelflags and merged
         * into the result at dstBitShift. Advances the fields sx, sy and dstBitShift as it goes.
         * sourceMap is an int[] if sourceMapIsWords, otherwise a byte[]; it is only read when
         * sourceDepth < 16.
         */
        /* nope - too much stuff in here */
        final int dstMask = MASK_TABLE[destDepth];
        long destWord = 0;
        /* Sub-pixel step sizes: the horizontal (h) and vertical (v) deltas divided by n */
        final long xdh;
        final long xdv;
        final long ydh;
        final long ydv;
        if (n == 2) {
            /* Try avoiding divides for most common n (divide by 2 is generated as shift) */
            xdh = xDeltah / 2;
            ydh = yDeltah / 2;
            xdv = xDeltav / 2;
            ydv = yDeltav / 2;
        } else {
            xdh = div(xDeltah, n);
            ydh = div(yDeltah, n);
            xdv = div(xDeltav, n);
            ydv = div(yDeltav, n);
        }
        int i = nPixels;
        do {
            int x = sx;
            int y = sy;
            /* Pick and average n*n subpixels */
            long a = 0;
            long r = 0;
            long g = 0;
            long b = 0;
            /* actual number of pixels (not clipped and not transparent) */
            long nPix = 0;
            long j = n;
            do {
                /* One line of n sub-pixels, stepping by (xdh, ydh) */
                int xx = x;
                int yy = y;
                long k = n;
                do {
                    long rgb = pickWarpPixelAtXy(xx, yy);
                    /* With rule 25 a zero pixel counts as transparent and is skipped */
                    if (!(combinationRule == 25 && rgb == 0)) {
                        /* If not clipped and not transparent, then tally rgb values */
                        nPix++;
                        if (sourceDepth < 16) {
                            /* Get RGBA values from sourcemap table */
                            final int rawValue;
                            if (sourceMapIsWords) {
                                rawValue = UnsafeUtils.getInt((int[]) sourceMap, rgb);
                            } else {
                                rawValue = UnsafeUtils.getInt((byte[]) sourceMap, rgb);
                            }
                            rgb = Integer.toUnsignedLong(rawValue);
                        } else {
                            /* Already in RGB format */
                            if (sourceDepth == 16) {
                                rgb = rgbMap16To32(rgb);
                            } else {
                                rgb = rgbMap32To32(rgb);
                            }
                        }
                        /* Accumulate the four 8-bit channels */
                        b += rgb & 0xFF;
                        g += rgb >>> 8 & 0xFF;
                        r += rgb >>> 16 & 0xFF;
                        a += rgb >>> 24;
                    }
                    xx += xdh;
                    yy += ydh;
                } while (--k > 0);
                /* Next sub-pixel line, stepping by (xdv, ydv) */
                x += xdv;
                y += ydv;
            } while (--j > 0);
            long rgb;
            if (nPix == 0 || combinationRule == 25 && nPix < n * n / 2) {
                /* All pixels were 0, or most were transparent */
                rgb = 0;
            } else {
                /* normalize rgba sums */
                if (nPix == 4) {
                    /* Try to avoid divides for most common n */
                    r = r >>> 2;
                    g = g >>> 2;
                    b = b >>> 2;
                    a = a >>> 2;
                } else {
                    r = div(r, nPix);
                    g = div(g, nPix);
                    b = div(b, nPix);
                    a = div(a, nPix);
                }
                /* map the pixel */
                rgb = (a << 24) + (r << 16) + (g << 8) + b;
                if (rgb == 0 && r + g + b + a > 0) {
                    /* only generate zero if pixel is really transparent */
                    rgb = 1;
                }
                rgb = mapPixelflags(rgb, cmFlags);
            }
            /* Merge the pixel into the result word and advance to the next destination pixel */
            destWord = destWord | shl(rgb & dstMask, dstBitShift);
            dstBitShift += dstShiftInc;
            sx += xDeltah;
            sy += yDeltah;
        } while (--i > 0);
        return destWord;
    }

    /*
     * Pick n pixels from the source form, map by colorMap and return aligned by dstBitShift. This
     * version is only called from WarpBlt with smoothingCount = 1
     */

    /* BitBltSimulation>>#warpPickSourcePixels:xDeltah:yDeltah:xDeltav:yDeltav:dstShiftInc:flags: */
    private long warpPickSourcePixelsxDeltahyDeltahxDeltavyDeltavdstShiftIncflags(final long nPixels, final long xDeltah, final long yDeltah, final long dstShiftInc, final long mapperFlags) {
        /*
         * Picks nPixels source pixels starting at (sx, sy), maps each one and merges it into the
         * result at dstBitShift. The fields sx, sy and dstBitShift are advanced per pixel. At least
         * one pixel is always processed.
         */
        final long pixelMask = MASK_TABLE[destDepth];
        final boolean indexedLookupOnly = mapperFlags == (COLOR_MAP_PRESENT | COLOR_MAP_INDEXED_PART);
        long result = 0;
        long remaining = nPixels;
        if (!indexedLookupOnly) {
            /* General case: grab, colormap and mix in pixel */
            do {
                final long mapped = mapPixelflags(pickWarpPixelAtXy(sx, sy), mapperFlags);
                result |= shl(mapped & pixelMask, dstBitShift);
                dstBitShift += dstShiftInc;
                sx += xDeltah;
                sy += yDeltah;
                remaining--;
            } while (remaining > 0);
            return result;
        }
        /* a little optimization for (pretty crucial) blits using indexed lookups only */
        do {
            final long picked = pickWarpPixelAtXy(sx, sy);
            final long mapped = cmLookupTable[(int) (picked & cmMask)];
            result |= shl(mapped & pixelMask, dstBitShift);
            dstBitShift += dstShiftInc;
            sx += xDeltah;
            sy += yDeltah;
            remaining--;
        } while (remaining > 0);
        return result;
    }

    /*
     * POLYFILLS
     */

    private int fetchIntegerofObject(final int index, final AbstractPointersObject object) {
        /* Reads the slot as int; a slot that is not a Long clears the success flag and yields 0 */
        final Object slotValue = fetchPointerofObject(index, object);
        if (!(slotValue instanceof Long)) {
            successFlag = false;
            return 0;
        }
        return MiscUtils.toIntExact((long) slotValue);
    }

    private static PointersObject fetchPointerofObjectOrNull(final int index, final AbstractPointersObject object) {
        /* Maps a nil slot to null; any other slot value is cast to PointersObject */
        final Object slotValue = fetchPointerofObject(index, object);
        return slotValue == NilObject.SINGLETON ? null : (PointersObject) slotValue;
    }

    private static NativeObject fetchNativeofObjectOrNull(final int index, final AbstractPointersObject object) {
        /* Maps a nil slot to null; any other slot value is cast to NativeObject */
        final Object slotValue = fetchPointerofObject(index, object);
        return slotValue == NilObject.SINGLETON ? null : (NativeObject) slotValue;
    }

    /* Returns the raw value of the instance variable at the given zero-based index of object. */
    private static Object fetchPointerofObject(final int index, final AbstractPointersObject object) {
        return object.instVarAt0Slow(index);
    }

    /* Answers whether the native object reports byte storage. */
    private static boolean isBytes(final NativeObject object) {
        return object.isByteType();
    }

    /* Answers whether the native object reports int (32-bit word) storage. */
    private static boolean isWords(final NativeObject object) {
        return object.isIntType();
    }

    /*
     * Answers whether an arbitrary object is a native object with int (32-bit word) storage.
     * Declared static like the sibling type checks, since no instance state is involved.
     */
    private static boolean isWords(final Object object) {
        return SqueakGuards.isNativeObject(object) && isWords((NativeObject) object);
    }

    /*
     * Answers whether an arbitrary object is a native object with either int (32-bit word) or byte
     * storage. Declared static like the sibling type checks, since no instance state is involved.
     */
    private static boolean isWordsOrBytes(final Object object) {
        if (!SqueakGuards.isNativeObject(object)) {
            return false;
        }
        /* Cast once and reuse for both storage checks */
        final NativeObject nativeObject = (NativeObject) object;
        return isWords(nativeObject) || isBytes(nativeObject);
    }

    /* Returns the int length reported by the native object. */
    private static int slotSizeOfWords(final NativeObject object) {
        return object.getIntLength();
    }

    /* Returns the size reported by the variable pointers object. */
    private static int slotSizeOf(final VariablePointersObject object) {
        return object.size();
    }

    /* Returns the size reported by the pointers object. */
    private static int slotSizeOf(final PointersObject object) {
        return object.size();
    }

    /* The static type already guarantees a PointersObject, so only null has to be ruled out. */
    private static boolean isPointers(final PointersObject object) {
        return object != null;
    }

    private static boolean isPointers(final Object object) {
        /* instanceof is false for null, so no separate null check is needed */
        return object instanceof PointersObject;
    }

    /* Answers true once the success flag has been cleared (e.g. by fetchIntegerofObject). */
    private boolean failed() {
        return !successFlag;
    }

    /* Sets the success flag back to true, so that failed() answers false again. */
    protected void resetSuccessFlag() {
        successFlag = true;
    }

    /*
     * Long division with the quotient narrowed to int. Java's / truncates toward zero and throws
     * ArithmeticException for b == 0; the int cast keeps only the low 32 bits of the quotient.
     */
    private static int div(final long a, final long b) {
        return (int) (a / b);
    }

    /*
     * Long remainder narrowed to int. Java's % takes the sign of the dividend, so a negative a
     * yields a non-positive result; b == 0 throws ArithmeticException.
     */
    private static int mod(final long a, final long b) {
        return (int) (a % b);
    }

    /* Left shift on long. As with any Java long shift, only the low six bits of b are used. */
    private static long shl(final long a, final long b) {
        return a << b;
    }

    /* Unsigned (zero-filling) right shift on long. Only the low six bits of b are used. */
    private static long shr(final long a, final long b) {
        return a >>> b;
    }

    /* SmallInteger>>bitShift: */
    private static long shift(final long a, final long b) {
        /* A negative count shifts right (zero-filling) by its magnitude */
        if (b < 0) {
            return a >>> -b;
        }
        /* Zero or positive count shifts left */
        return a << b;
    }

    /* Stores the long value into the instance variable at the given zero-based index of target. */
    private static void storeIntegerofObjectwithValue(final int index, final PointersObject target, final long value) {
        target.instVarAtPut0Slow(index, value);
    }

    /* Reads one 32-bit word of the destination bits and returns it as an unsigned value in a long. */
    private long dstLongAt(final long index) {
        /* index >>> 2 is the word index; the bound is only checked when assertions are enabled */
        assert index >>> 2 < endOfDestination;
        return Integer.toUnsignedLong(UnsafeUtils.getIntAt(destBits, destBitsBaseOffset + (index >>> 2) * destBitsIndexScale));
    }

    /*
     * Store the given value back into destination form, using dstMask to mask out the bits to be
     * modified. This is an essential read-modify-write operation on the destination form.
     */
    private void destLongAtputmask(final long dstIndex, final long dstMask, final long sourceWord) {
        /* Keep the destination bits selected by dstMask and OR the source word on top of them */
        final long preservedBits = dstLongAt(dstIndex) & dstMask;
        dstLongAtput(dstIndex, preservedBits | sourceWord);
    }

    /* Writes the low 32 bits of value into the destination word at index >>> 2; no bounds assertion here. */
    private void dstLongAtput(final long index, final long value) {
        UnsafeUtils.putIntAt(destBits, destBitsBaseOffset + (index >>> 2) * destBitsIndexScale, (int) value);
    }

    /*
     * Returns the halftone word for the given row as an unsigned 32-bit value. The row wraps around
     * halftoneHeight via mod, which is a Java remainder: a negative index would produce a negative
     * array index.
     */
    private long halftoneLongAt(final long index) {
        return Integer.toUnsignedLong(halftoneBits[mod(index, halftoneHeight)]);
    }

    private long srcLongAt(final long index) {
        /*
         * Unfortunately, BitBlt sometimes tries to read past the end or before the start of
         * sourceBits, so answer 0 in these cases. Examples are #testPivelValueAt (confirmed by
         * SqueakJS's BitBltPlugin) or `PolygonMorph arrowPrototype`.
         */
        if (index < 0 || index >= endOfSource) {
            return 0L;
        }
        /* index >>> 2 is the word index into the source bits */
        final long wordOffset = sourceBitsBaseOffset + (index >>> 2) * sourceBitsIndexScale;
        return Integer.toUnsignedLong(UnsafeUtils.getIntAt(sourceBits, wordOffset));
    }
}