package layers; import android.renderscript.Allocation; import android.renderscript.Element; import android.renderscript.RenderScript; import android.renderscript.Type; import android.util.Log; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.util.Scanner; import messagepack.ParamUnpacker; import numdroid.MyNum; public class Convolution implements LayerInterface { private String name; // name of the layer private String paramFilePath; // name of the file which specifies the weights and biases private ParamUnpacker paramUnpacker; // for extracting the wieghts and biases from the parameters file private int[] stride; // strides private int[] pad; // pads private int group; // number of groups private MyNum myNum; // for mathematical calculations private RenderScript myRS; // RenderScript object private boolean nonLinear; // Does a non-linear layer follow this layer? private NonLinearType nonLinearType; // non-linearity type (if applicable) private boolean parallel; // implementation method (parallel or sequential) private boolean loadParamsAtStart; // if true, layer parameters will be loaded at the construction of network, otherwise the parameters will be loaded in run time private float[][][][] weight; // weight parameter of network private float[] bias; // bias parameter of network private String tuningFolder; // location to store online tuning results private boolean tuneNow; // flag to weather execute tuning ro not private boolean tuneFunc; // flag of optional tuning function private String algorithm; // acceleration method private String[] names = {"F4F1", "F4F2", "F4F4", "F4F8", "F8F1", "F8F2", "F8F4", "F8F8"}; private ScriptC_convRolledInF4OutF1 myScript41; private ScriptC_convRolledInF4OutF2 myScript42; private ScriptC_convRolledInF4OutF4 myScript44; private ScriptC_convRolledInF4OutF8 myScript48; private ScriptC_convRolledInF8OutF1 myScript81; private ScriptC_convRolledInF8OutF2 myScript82; private ScriptC_convRolledInF8OutF4 myScript84; private ScriptC_convRolledInF8OutF8 myScript88; // types of non-linear layer that may be appended to this layer public enum NonLinearType { RectifiedLinearUnit, None } public Convolution(int[] stride, int[] pad, int group, String paramFilePath, boolean parallel, boolean loadParamsAtStart, boolean tuneFunc, RenderScript myRS, String name, String tuningFolder) { this.paramFilePath = paramFilePath; this.stride = stride; this.pad = pad; this.group = group; this.nonLinearType = NonLinearType.None; this.nonLinear = false; this.myRS = myRS; this.parallel = parallel; this.name = name; this.myNum = new MyNum(); this.paramUnpacker = new ParamUnpacker(); this.loadParamsAtStart = loadParamsAtStart; this.tuneFunc = tuneFunc; this.tuningFolder = tuningFolder; tuneNow = false; File f = new File(tuningFolder + "/" + name + ".txt"); try { Scanner s = new Scanner(f); algorithm = s.nextLine(); if (corrupted(algorithm)) tuneNow = true; } catch (FileNotFoundException e) { tuneNow = true; } if (!tuneFunc) { algorithm = "F8F4"; tuneNow = false; } if (loadParamsAtStart && (!tuneNow || !parallel)) { long loadTime = System.currentTimeMillis(); Object[] objects = paramUnpacker.unpackerFunction(paramFilePath, new Class[]{float[][][][].class, float[].class}); weight = (float[][][][]) objects[0]; bias = (float[]) objects[1]; loadTime = System.currentTimeMillis() - loadTime; long kernelTime = System.currentTimeMillis(); Log.d("CNNdroid", "layers." + name + ": Parameters Load Time in Constructor = " + String.valueOf(loadTime)); if (parallel) { switch (algorithm) { case "F4F1": initKernelF4F1(weight, bias); break; case "F4F2": initKernelF4F2(weight, bias); break; case "F4F4": initKernelF4F4(weight, bias); break; case "F4F8": initKernelF4F8(weight, bias); break; case "F8F1": initKernelF8F1(weight, bias); break; case "F8F2": initKernelF8F2(weight, bias); break; case "F8F4": initKernelF8F4(weight, bias); break; case "F8F8": initKernelF8F8(weight, bias); break; } kernelTime = System.currentTimeMillis() - kernelTime; Log.d("CNNdroid", "layers." + name + ": Kernel Initialization Time in Constructor = " + String.valueOf(kernelTime)); } } } public void setNonLinearType(NonLinearType nonLinearType) { this.nonLinearType = nonLinearType; nonLinear = true; } @Override public Object compute(Object input) { long loadTime; if (!loadParamsAtStart && (!tuneNow || !parallel)) { loadTime = System.currentTimeMillis(); Object[] objects = paramUnpacker.unpackerFunction(paramFilePath, new Class[]{float[][][][].class, float[].class}); float[][][][] localWeight = (float[][][][]) objects[0]; float[] localBias = (float[]) objects[1]; if (parallel){ switch (algorithm) { case "F4F1": initKernelF4F1(localWeight, localBias); break; case "F4F2": initKernelF4F2(localWeight, localBias); break; case "F4F4": initKernelF4F4(localWeight, localBias); break; case "F4F8": initKernelF4F8(localWeight, localBias); break; case "F8F1": initKernelF8F1(localWeight, localBias); break; case "F8F2": initKernelF8F2(localWeight, localBias); break; case "F8F4": initKernelF8F4(localWeight, localBias); break; case "F8F8": initKernelF8F8(localWeight, localBias); break; } } loadTime = System.currentTimeMillis() - loadTime; Log.d("CNNdroid", "layers." + name + ": Parameters Load Time = " + String.valueOf(loadTime)); return invokeFunctions(input, localWeight, localBias, true); } else { return invokeFunctions(input,weight, bias, false); } } ///////////////////////////////////////Sequential/////////////////////////////////////////////// private float[][][][] convLayerRolledSeq(float[][][][] inputBlob, float[][][][] filterBlob, float[] biasBlob, int[] pad, int[] stride, int group) { /* Convolution Layer Inputs: kernel[0] is a filter blob. kernel[1] is bias blob. */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = filterBlob.length; int c_k = filterBlob[0].length; int h_k = filterBlob[0][0].length; int w_k = filterBlob[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; // calculate the result for (int n = 0; n < (n_i); n++) // for n in images for (int k = 0; k < (n_k / group); k++)// for k in kernels for (int g = 0; g < (group); g++) { float[][][] convInFrame = new float[(c_i / group)][h_i][w_i]; float[][][] convInKernel = new float[(c_i / group)][h_i][w_i]; int temp = g * c_i / group; for (int i = g * c_i / group; i < (g + 1) * c_i / group; i++) // copy part of inputBlob convInFrame[i - temp] = inputBlob[n][i]; convInKernel = filterBlob[g * n_k / group + k]; // copy outputBlob[n][k + g * n_k / group] = convRolledSeq(convInFrame, convInKernel, biasBlob[g * n_k / group + k], pad, stride); } // return the result return outputBlob; } private float[][] convRolledSeq(float[][][] frames, float[][][] kernel, float bias, int[] pad, int[] stride) { // Calculate final dimensions. int c_i = frames.length; int h_i = frames[0].length; int w_i = frames[0][0].length; int c_k = kernel.length; int h_k = kernel[0].length; int w_k = kernel[0][0].length; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) stride[0])) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) stride[1])) + 1); int h_s = stride[0]; int w_s = stride[1]; float[][] out = new float[h_o][w_o]; // Compute pixel values. for (int i = 0; i < h_o; ++i) for (int j = 0; j < w_o; ++j) out[i][j] = myNum.sum_conv(frames, kernel, i * h_s, j * w_s, pad[0], pad[1]) + bias; return out; } ////////////////////////////////////////Parallel//////////////////////////////////////////////// // Input: Float4 ***** Output: Float private float[][][][] convLayerRolledParInF4OutF1(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; int c_i_4 = c_i; if (c_i % (4 * group) != 0) c_i_4 = c_i + (4 * group) - c_i % (4 * group); //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation outAllocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_4 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32(myRS), h_o * w_o * n_k); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); outAllocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript41.set_c_i(c_i_4); myScript41.set_h_i(h_i); myScript41.set_w_i(w_i); myScript41.set_h_o(h_o); myScript41.set_w_o(w_o); // calculate the result float[] outMatrix = new float[h_o * w_o * n_k]; float[] frameMatrix = new float[h_i * w_i * c_i_4]; int delta_c = (c_i_4 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript41.set_In_Blob(frameAllocation); myScript41.forEach_root(outAllocation); if (n < n_i - 1) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { outputBlob[n - 1][i][j][k] = outMatrix[i * h_o * w_o + j * w_o + k]; if (nonLinear) if (outputBlob[n - 1][i][j][k] < 0) outputBlob[n - 1][i][j][k] = 0; } } outAllocation.copyTo(outMatrix); if (n == n_i - 1) { for (int i = 0; i < n_k; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { outputBlob[n][i][j][k] = outMatrix[i * h_o * w_o + j * w_o + k]; if (nonLinear) if (outputBlob[n][i][j][k] < 0) outputBlob[n][i][j][k] = 0; } } } frameAllocation.destroy(); outAllocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript41.destroy(); // return the result return outputBlob; } // Input: Float4 ***** Output: Float2 private float[][][][] convLayerRolledParInF4OutF2(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; int c_i_4 = c_i; if (c_i % (4 * group) != 0) c_i_4 = c_i + (4 * group) - c_i % (4 * group); int n_k_2 = n_k; if (n_k % (2 * group) != 0) n_k_2 = n_k + (2 * group) - n_k % (2 * group); int delta_n = (n_k_2 - n_k) / group; //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation outAllocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_4 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32_2(myRS), h_o * w_o * n_k_2 / 2); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); outAllocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript42.set_c_i(c_i_4); myScript42.set_h_i(h_i); myScript42.set_w_i(w_i); myScript42.set_h_o(h_o); myScript42.set_w_o(w_o); // calculate the result float[] outMatrix = new float[h_o * w_o * n_k_2]; float[] frameMatrix = new float[h_i * w_i * c_i_4]; int delta_c = (c_i_4 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript42.set_In_Blob(frameAllocation); myScript42.forEach_root(outAllocation); if (n < n_i - 1) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k_2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_2 / group - delta_n) { outputBlob[n - 1][i][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n - 1][i][j][k] < 0) outputBlob[n - 1][i][j][k] = 0; } } else if ((i >= n_k_2 / group) && (i < n_k_2 - delta_n)) { outputBlob[n - 1][i - delta_n][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n - 1][i - delta_n][j][k] < 0) outputBlob[n - 1][i - delta_n][j][k] = 0; } } } } outAllocation.copyTo(outMatrix); if (n == n_i - 1) { for (int i = 0; i < n_k_2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_2 / group - delta_n) { outputBlob[n][i][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n][i][j][k] < 0) outputBlob[n][i][j][k] = 0; } } else if ((i >= n_k_2 / group) && (i < n_k_2 - delta_n)) { outputBlob[n][i - delta_n][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n][i - delta_n][j][k] < 0) outputBlob[n][i - delta_n][j][k] = 0; } } } } } frameAllocation.destroy(); outAllocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript42.destroy(); // return the result return outputBlob; } // Input: Float4 ***** Output: Float4 private float[][][][] convLayerRolledParInF4OutF4(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; //check channel count int c_k_4 = c_k; if (c_k % 4 != 0) c_k_4 = c_k + 4 - c_k % 4; int c_i_4 = c_i; if (c_i % 4 != 0) c_i_4 = c_i + 4 - c_i % 4; int n_k_4 = n_k; if (n_k % 4 != 0) n_k_4 = n_k + 4 - n_k % 4; int delta_n = (n_k_4 - n_k) / group; //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation outAllocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_4 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32_4(myRS), h_o * w_o * n_k_4 / 4); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); outAllocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript44.set_c_i(c_i_4); myScript44.set_h_i(h_i); myScript44.set_w_i(w_i); myScript44.set_h_o(h_o); myScript44.set_w_o(w_o); // calculate the result float[] outMatrix = new float[h_o * w_o * n_k_4]; float[] frameMatrix = new float[h_i * w_i * c_i_4]; int delta_c = (c_i_4 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript44.set_In_Blob(frameAllocation); myScript44.forEach_root(outAllocation); if (n < n_i - 1) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k_4; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_4 / group - delta_n) { outputBlob[n - 1][i][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n - 1][i][j][k] < 0) outputBlob[n - 1][i][j][k] = 0; } } else if ((i >= n_k_4 / group) && (i < n_k_4 - delta_n)) { outputBlob[n - 1][i - delta_n][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n - 1][i - delta_n][j][k] < 0) outputBlob[n - 1][i - delta_n][j][k] = 0; } } } } outAllocation.copyTo(outMatrix); if (n == n_i - 1) { for (int i = 0; i < n_k_4; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_4 / group - delta_n) { outputBlob[n][i][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n][i][j][k] < 0) outputBlob[n][i][j][k] = 0; } } else if ((i >= n_k_4 / group) && (i < n_k_4 - delta_n)) { outputBlob[n][i - delta_n][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n][i - delta_n][j][k] < 0) outputBlob[n][i - delta_n][j][k] = 0; } } } } } frameAllocation.destroy(); outAllocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript44.destroy(); // return the result return outputBlob; } // Input: Float4 ***** Output: Float8 private float[][][][] convLayerRolledParInF4OutF8(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; //check channel count int c_i_4 = c_i; if (c_i % 4 != 0) c_i_4 = c_i + 4 - c_i % 4; int n_k_8 = n_k; if (n_k % 8 != 0) n_k_8 = n_k + 8 - n_k % 8; int delta_n = (n_k_8 - n_k) / group; //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation out1Allocation; Allocation out2Allocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_4 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32_4(myRS), h_o * w_o * n_k_8 / 8); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); out1Allocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); out2Allocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript48.set_Out_Alloc(out2Allocation); myScript48.set_c_i(c_i_4); myScript48.set_h_i(h_i); myScript48.set_w_i(w_i); myScript48.set_h_o(h_o); myScript48.set_w_o(w_o); // calculate the result float[] out1Matrix = new float[n_k_8 * h_o * w_o / 2]; float[] out2Matrix = new float[n_k_8 * h_o * w_o / 2]; float[] frameMatrix = new float[h_i * w_i * c_i_4]; int delta_c = (c_i_4 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript48.set_In_Blob(frameAllocation); myScript48.forEach_root(out1Allocation); if (n < n_i - 1) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k_8 / 2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (2 * i < n_k_8 / group - delta_n) { outputBlob[n - 1][2 * i][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i][j][k] < 0) outputBlob[n - 1][2 * i][j][k] = 0; } } else if ((2 * i >= n_k_8 / group) && (2 * i < n_k_8 - delta_n)) { outputBlob[n - 1][2 * i - delta_n][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i - delta_n][j][k] < 0) outputBlob[n - 1][2 * i - delta_n][j][k] = 0; } } if (2 * i + 1 < n_k_8 / group - delta_n) { outputBlob[n - 1][2 * i + 1][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i + 1][j][k] < 0) outputBlob[n - 1][2 * i + 1][j][k] = 0; } } else if ((2 * i + 1 >= n_k_8 / group) && (2 * i + 1 < n_k_8 - delta_n)) { outputBlob[n - 1][2 * i + 1 - delta_n][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i + 1 - delta_n][j][k] < 0) outputBlob[n - 1][2 * i + 1 - delta_n][j][k] = 0; } } } } out1Allocation.copyTo(out1Matrix); out2Allocation.copyTo(out2Matrix); if (n == n_i - 1) { for (int i = 0; i < n_k_8 / 2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (2 * i < n_k_8 / group - delta_n) { outputBlob[n][2 * i][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i][j][k] < 0) outputBlob[n][2 * i][j][k] = 0; } } else if ((2 * i >= n_k_8 / group) && (2 * i < n_k_8 - delta_n)) { outputBlob[n][2 * i - delta_n][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i - delta_n][j][k] < 0) outputBlob[n][2 * i - delta_n][j][k] = 0; } } if (2 * i + 1 < n_k_8 / group - delta_n) { outputBlob[n][2 * i + 1][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i + 1][j][k] < 0) outputBlob[n][2 * i + 1][j][k] = 0; } } else if ((2 * i + 1 >= n_k_8 / group) && (2 * i + 1 < n_k_8 - delta_n)) { outputBlob[n][2 * i + 1 - delta_n][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i + 1 - delta_n][j][k] < 0) outputBlob[n][2 * i + 1 - delta_n][j][k] = 0; } } } } } frameAllocation.destroy(); out1Allocation.destroy(); out2Allocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript48.destroy(); // return the result return outputBlob; } // Input: Float8 ***** Output: Float1 private float[][][][] convLayerRolledParInF8OutF1(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; int c_i_8 = c_i; if (c_i % (8 * group) != 0) c_i_8 = c_i + (8 * group) - c_i % (8 * group); int delta_n = (n_k - n_k) / group; //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation outAllocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_8 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32(myRS), h_o * w_o * n_k); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); outAllocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript81.set_c_i(c_i_8); myScript81.set_h_i(h_i); myScript81.set_w_i(w_i); myScript81.set_h_o(h_o); myScript81.set_w_o(w_o); // calculate the result float[] outMatrix = new float[h_o * w_o * n_k]; float[] frameMatrix = new float[h_i * w_i * c_i_8]; int delta_c = (c_i_8 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_8; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_8 / group - delta_c) && (i < c_i_8 / group)) || (i >= c_i_8 - delta_c)) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = 0; else if (i >= c_i_8 / group) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript81.set_In_Blob(frameAllocation); myScript81.forEach_root(outAllocation); if (n < n_i - 1) { for (int i = 0; i < c_i_8; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_8 / group - delta_c) && (i < c_i_8 / group)) || (i >= c_i_8 - delta_c)) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = 0; else if (i >= c_i_8 / group) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k / group - delta_n) { outputBlob[n - 1][i][j][k] = outMatrix[j * w_o * n_k + k * n_k + i]; if (nonLinear) { if (outputBlob[n - 1][i][j][k] < 0) outputBlob[n - 1][i][j][k] = 0; } } else if ((i >= n_k / group) && (i < n_k - delta_n)) { outputBlob[n - 1][i - delta_n][j][k] = outMatrix[j * w_o * n_k + k * n_k + i]; if (nonLinear) { if (outputBlob[n - 1][i - delta_n][j][k] < 0) outputBlob[n - 1][i - delta_n][j][k] = 0; } } } } outAllocation.copyTo(outMatrix); if (n == n_i - 1) { for (int i = 0; i < n_k; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k / group - delta_n) { outputBlob[n][i][j][k] = outMatrix[j * w_o * n_k + k * n_k + i]; if (nonLinear) { if (outputBlob[n][i][j][k] < 0) outputBlob[n][i][j][k] = 0; } } else if ((i >= n_k / group) && (i < n_k - delta_n)) { outputBlob[n][i - delta_n][j][k] = outMatrix[j * w_o * n_k + k * n_k + i]; if (nonLinear) { if (outputBlob[n][i - delta_n][j][k] < 0) outputBlob[n][i - delta_n][j][k] = 0; } } } } } frameAllocation.destroy(); outAllocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript81.destroy(); // return the result return outputBlob; } // Input: Float8 ***** Output: Float2 private float[][][][] convLayerRolledParInF8OutF2(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; int c_i_8 = c_i; if (c_i % (8 * group) != 0) c_i_8 = c_i + (8 * group) - c_i % (8 * group); int n_k_2 = n_k; if (n_k % (2 * group) != 0) n_k_2 = n_k + (2 * group) - n_k % (2 * group); int delta_n = (n_k_2 - n_k) / group; //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation outAllocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_8 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32_2(myRS), h_o * w_o * n_k_2 / 2); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); outAllocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript82.set_c_i(c_i_8); myScript82.set_h_i(h_i); myScript82.set_w_i(w_i); myScript82.set_h_o(h_o); myScript82.set_w_o(w_o); // calculate the result float[] outMatrix = new float[h_o * w_o * n_k_2]; float[] frameMatrix = new float[h_i * w_i * c_i_8]; int delta_c = (c_i_8 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_8; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_8 / group - delta_c) && (i < c_i_8 / group)) || (i >= c_i_8 - delta_c)) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = 0; else if (i >= c_i_8 / group) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript82.set_In_Blob(frameAllocation); myScript82.forEach_root(outAllocation); if (n < n_i - 1) { for (int i = 0; i < c_i_8; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_8 / group - delta_c) && (i < c_i_8 / group)) || (i >= c_i_8 - delta_c)) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = 0; else if (i >= c_i_8 / group) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k_2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_2 / group - delta_n) { outputBlob[n - 1][i][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n - 1][i][j][k] < 0) outputBlob[n - 1][i][j][k] = 0; } } else if ((i >= n_k_2 / group) && (i < n_k_2 - delta_n)) { outputBlob[n - 1][i - delta_n][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n - 1][i - delta_n][j][k] < 0) outputBlob[n - 1][i - delta_n][j][k] = 0; } } } } outAllocation.copyTo(outMatrix); if (n == n_i - 1) { for (int i = 0; i < n_k_2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_2 / group - delta_n) { outputBlob[n][i][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n][i][j][k] < 0) outputBlob[n][i][j][k] = 0; } } else if ((i >= n_k_2 / group) && (i < n_k_2 - delta_n)) { outputBlob[n][i - delta_n][j][k] = outMatrix[j * w_o * n_k_2 + k * n_k_2 + i]; if (nonLinear) { if (outputBlob[n][i - delta_n][j][k] < 0) outputBlob[n][i - delta_n][j][k] = 0; } } } } } frameAllocation.destroy(); outAllocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript82.destroy(); // return the result return outputBlob; } // Input: Float8 ***** Output: Float4 private float[][][][] convLayerRolledParInF8OutF4(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; int c_i_8 = c_i; if (c_i % (8 * group) != 0) c_i_8 = c_i + (8 * group) - c_i % (8 * group); int n_k_4 = n_k; if (n_k % (4 * group) != 0) n_k_4 = n_k + (4 * group) - n_k % (4 * group); int delta_n = (n_k_4 - n_k) / group; //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation outAllocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_8 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32_4(myRS), h_o * w_o * n_k_4 / 4); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); outAllocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript84.set_c_i(c_i_8); myScript84.set_h_i(h_i); myScript84.set_w_i(w_i); myScript84.set_h_o(h_o); myScript84.set_w_o(w_o); // calculate the result float[] outMatrix = new float[h_o * w_o * n_k_4]; float[] frameMatrix = new float[h_i * w_i * c_i_8]; int delta_c = (c_i_8 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_8; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_8 / group - delta_c) && (i < c_i_8 / group)) || (i >= c_i_8 - delta_c)) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = 0; else if (i >= c_i_8 / group) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript84.set_In_Blob(frameAllocation); myScript84.forEach_root(outAllocation); if (n < n_i - 1) { for (int i = 0; i < c_i_8; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_8 / group - delta_c) && (i < c_i_8 / group)) || (i >= c_i_8 - delta_c)) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = 0; else if (i >= c_i_8 / group) frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_8 + k * c_i_8 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k_4; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_4 / group - delta_n) { outputBlob[n - 1][i][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n - 1][i][j][k] < 0) outputBlob[n - 1][i][j][k] = 0; } } else if ((i >= n_k_4 / group) && (i < n_k_4 - delta_n)) { outputBlob[n - 1][i - delta_n][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n - 1][i - delta_n][j][k] < 0) outputBlob[n - 1][i - delta_n][j][k] = 0; } } } } outAllocation.copyTo(outMatrix); if (n == n_i - 1) { for (int i = 0; i < n_k_4; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (i < n_k_4 / group - delta_n) { outputBlob[n][i][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n][i][j][k] < 0) outputBlob[n][i][j][k] = 0; } } else if ((i >= n_k_4 / group) && (i < n_k_4 - delta_n)) { outputBlob[n][i - delta_n][j][k] = outMatrix[j * w_o * n_k_4 + k * n_k_4 + i]; if (nonLinear) { if (outputBlob[n][i - delta_n][j][k] < 0) outputBlob[n][i - delta_n][j][k] = 0; } } } } } frameAllocation.destroy(); outAllocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript84.destroy(); // return the result return outputBlob; } // Input: Float8 ***** Output: Float8 private float[][][][] convLayerRolledParInF8OutF8(float[][][][] inputBlob, float[][][][] myWeight, boolean destroy) { /* Convolution layer. Inputs: kernel[0] is a filter blob kernel[1] is bias blob */ // calculate sizes //(n_i, c_i, h_i, w_i) = inputBlob.shape int n_i = inputBlob.length; int c_i = inputBlob[0].length; int h_i = inputBlob[0][0].length; int w_i = inputBlob[0][0][0].length; //(n_k, c_k, h_k, w_k) = kernel_blob[0].shape int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int n_o = n_i; int h_o = (int) (Math.ceil((h_i + 2 * pad[0] - h_k) / ((float) (stride[0]))) + 1); int w_o = (int) (Math.ceil((w_i + 2 * pad[1] - w_k) / ((float) (stride[1]))) + 1); int c_o = n_k; // initialize the result float[][][][] outputBlob = new float[n_o][c_o][h_o][w_o]; int c_i_4 = c_i; if (c_i % (8 * group) != 0) c_i_4 = c_i + (8 * group) - c_i % (8 * group); int n_k_8 = n_k; if (n_k % (8 * group) != 0) n_k_8 = n_k + (8 * group) - n_k % (8 * group); int delta_n = (n_k_8 - n_k) / group; //initialize Renderscript Type inputType, outType; Allocation frameAllocation; Allocation out1Allocation; Allocation out2Allocation; inputType = Type.createX(myRS, Element.F32_4(myRS), c_i_4 * h_i * w_i / 4); outType = Type.createX(myRS, Element.F32_4(myRS), h_o * w_o * n_k_8 / 8); frameAllocation = Allocation.createTyped(myRS, inputType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); out1Allocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); out2Allocation = Allocation.createTyped(myRS, outType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); myScript88.set_Out_Alloc(out2Allocation); myScript88.set_c_i(c_i_4); myScript88.set_h_i(h_i); myScript88.set_w_i(w_i); myScript88.set_h_o(h_o); myScript88.set_w_o(w_o); // calculate the result float[] out1Matrix = new float[n_k_8 * h_o * w_o / 2]; float[] out2Matrix = new float[n_k_8 * h_o * w_o / 2]; float[] frameMatrix = new float[h_i * w_i * c_i_4]; int delta_c = (c_i_4 - c_i) / group; for (int n = 0; n < (n_i); n++) {// for n in images if (n == 0) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n][i][j][k]; } } frameAllocation.copyFrom(frameMatrix); myScript88.set_In_Blob(frameAllocation); myScript88.forEach_root(out1Allocation); if (n < n_i - 1) { for (int i = 0; i < c_i_4; i++) for (int j = 0; j < h_i; j++) for (int k = 0; k < w_i; k++) { if (((i >= c_i_4 / group - delta_c) && (i < c_i_4 / group)) || (i >= c_i_4 - delta_c)) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = 0; else if (i >= c_i_4 / group) frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i - delta_c][j][k]; else frameMatrix[j * w_i * c_i_4 + k * c_i_4 + i] = inputBlob[n + 1][i][j][k]; } } if (n > 0) { for (int i = 0; i < n_k_8 / 2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (2 * i < n_k_8 / group - delta_n) { outputBlob[n - 1][2 * i][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i][j][k] < 0) outputBlob[n - 1][2 * i][j][k] = 0; } } else if ((2 * i >= n_k_8 / group) && (2 * i < n_k_8 - delta_n)) { outputBlob[n - 1][2 * i - delta_n][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i - delta_n][j][k] < 0) outputBlob[n - 1][2 * i - delta_n][j][k] = 0; } } if (2 * i + 1 < n_k_8 / group - delta_n) { outputBlob[n - 1][2 * i + 1][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i + 1][j][k] < 0) outputBlob[n - 1][2 * i + 1][j][k] = 0; } } else if ((2 * i + 1 >= n_k_8 / group) && (2 * i + 1 < n_k_8 - delta_n)) { outputBlob[n - 1][2 * i + 1 - delta_n][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n - 1][2 * i + 1 - delta_n][j][k] < 0) outputBlob[n - 1][2 * i + 1 - delta_n][j][k] = 0; } } } } out1Allocation.copyTo(out1Matrix); out2Allocation.copyTo(out2Matrix); if (n == n_i - 1) { for (int i = 0; i < n_k_8 / 2; i++) for (int j = 0; j < h_o; j++) for (int k = 0; k < w_o; k++) { if (2 * i < n_k_8 / group - delta_n) { outputBlob[n][2 * i][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i][j][k] < 0) outputBlob[n][2 * i][j][k] = 0; } } else if ((2 * i >= n_k_8 / group) && (2 * i < n_k_8 - delta_n)) { outputBlob[n][2 * i - delta_n][j][k] = out1Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i - delta_n][j][k] < 0) outputBlob[n][2 * i - delta_n][j][k] = 0; } } if (2 * i + 1 < n_k_8 / group - delta_n) { outputBlob[n][2 * i + 1][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i + 1][j][k] < 0) outputBlob[n][2 * i + 1][j][k] = 0; } } else if ((2 * i + 1 >= n_k_8 / group) && (2 * i + 1 < n_k_8 - delta_n)) { outputBlob[n][2 * i + 1 - delta_n][j][k] = out2Matrix[j * w_o * n_k_8 / 2 + k * n_k_8 / 2 + i]; if (nonLinear) { if (outputBlob[n][2 * i + 1 - delta_n][j][k] < 0) outputBlob[n][2 * i + 1 - delta_n][j][k] = 0; } } } } } frameAllocation.destroy(); out1Allocation.destroy(); out2Allocation.destroy(); inputType.destroy(); outType.destroy(); if (destroy) myScript88.destroy(); // return the result return outputBlob; } ///////////////////////////////Kernel Initialization Functions////////////////////////////////// private void initKernelF4F1(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_4 = c_k; if (c_k % 4 != 0) c_k_4 = c_k + 4 - c_k % 4; Allocation kernelAllocation; Allocation biasAllocation; Type kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k * c_k_4 * h_k * w_k / 4); Type biasType = Type.createX(myRS, Element.F32(myRS), n_k); float[] kernelMatrix = new float[n_k * h_k * w_k * c_k_4]; float[] biasArray = new float[n_k]; int delta_n = (n_k - n_k) / group; for (int i = 0; i < n_k; i++) for (int j = 0; j < c_k_4; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k / group - delta_n) && (i < n_k / group)) || (i >= n_k - delta_n)) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = 0; else if (i >= n_k / group) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k; i++) { if (((i >= n_k / group - delta_n) && (i < n_k / group)) || (i >= n_k - delta_n)) biasArray[i] = 0; else if (i >= n_k / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript41 = new ScriptC_convRolledInF4OutF1(myRS); myScript41.set_Bias_Blob(biasAllocation); myScript41.set_Kernel_Blob(kernelAllocation); myScript41.set_n_k(n_k); myScript41.set_c_k(c_k_4); myScript41.set_h_k(h_k); myScript41.set_w_k(w_k); myScript41.set_pad_x(pad[0]); myScript41.set_pad_y(pad[1]); myScript41.set_stride_x(stride[0]); myScript41.set_stride_y(stride[1]); myScript41.set_group(group); } private void initKernelF4F2(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_4 = c_k; if (c_k % 4 != 0) c_k_4 = c_k + 4 - c_k % 4; int n_k_2 = n_k; if (n_k % 2 != 0) n_k_2 = n_k + 2 - n_k % 2; Allocation kernelAllocation; Allocation biasAllocation; Type kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k_2 * c_k_4 * h_k * w_k / 4); Type biasType = Type.createX(myRS, Element.F32_2(myRS), n_k_2 / 2); float[] kernelMatrix = new float[n_k_2 * h_k * w_k * c_k_4]; float[] biasArray = new float[n_k_2]; int delta_n = (n_k_2 - n_k) / group; for (int i = 0; i < n_k_2; i++) for (int j = 0; j < c_k_4; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k_2 / group - delta_n) && (i < n_k_2 / group)) || (i >= n_k_2 - delta_n)) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = 0; else if (i >= n_k_2 / group) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k_2; i++) { if (((i >= n_k_2 / group - delta_n) && (i < n_k_2 / group)) || (i >= n_k_2 - delta_n)) biasArray[i] = 0; else if (i >= n_k_2 / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript42 = new ScriptC_convRolledInF4OutF2(myRS); myScript42.set_Bias_Blob(biasAllocation); myScript42.set_Kernel_Blob(kernelAllocation); myScript42.set_n_k(n_k_2); myScript42.set_c_k(c_k_4); myScript42.set_h_k(h_k); myScript42.set_w_k(w_k); myScript42.set_pad_x(pad[0]); myScript42.set_pad_y(pad[1]); myScript42.set_stride_x(stride[0]); myScript42.set_stride_y(stride[1]); myScript42.set_group(group); } private void initKernelF4F4(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_4 = c_k; if (c_k % 4 != 0) c_k_4 = c_k + 4 - c_k % 4; int n_k_4 = n_k; if (n_k % 4 != 0) n_k_4 = n_k + 4 - n_k % 4; Allocation kernelAllocation; Allocation biasAllocation; Type kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k_4 * c_k_4 * h_k * w_k / 4); Type biasType = Type.createX(myRS, Element.F32_4(myRS), n_k_4 / 4); float[] kernelMatrix = new float[n_k_4 * h_k * w_k * c_k_4]; float[] biasArray = new float[n_k_4]; int delta_n = (n_k_4 - n_k) / group; for (int i = 0; i < n_k_4; i++) for (int j = 0; j < c_k_4; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k_4 / group - delta_n) && (i < n_k_4 / group)) || (i >= n_k_4 - delta_n)) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = 0; else if (i >= n_k_4 / group) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k_4; i++) { if (((i >= n_k_4 / group - delta_n) && (i < n_k_4 / group)) || (i >= n_k_4 - delta_n)) biasArray[i] = 0; else if (i >= n_k_4 / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript44 = new ScriptC_convRolledInF4OutF4(myRS); myScript44.set_Bias_Blob(biasAllocation); myScript44.set_Kernel_Blob(kernelAllocation); myScript44.set_n_k(n_k_4); myScript44.set_c_k(c_k_4); myScript44.set_h_k(h_k); myScript44.set_w_k(w_k); myScript44.set_pad_x(pad[0]); myScript44.set_pad_y(pad[1]); myScript44.set_stride_x(stride[0]); myScript44.set_stride_y(stride[1]); myScript44.set_group(group); } private void initKernelF4F8(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_4 = c_k; if (c_k % 4 != 0) c_k_4 = c_k + 4 - c_k % 4; int n_k_8 = n_k; if (n_k % 8 != 0) n_k_8 = n_k + 8 - n_k % 8; Type kernelType, biasType; Allocation kernelAllocation; Allocation biasAllocation; kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k_8 * c_k_4 * h_k * w_k / 4); biasType = Type.createX(myRS, Element.F32_4(myRS), n_k_8 / 4); float[] kernelMatrix = new float[n_k_8 * h_k * w_k * c_k_4]; float[] biasArray = new float[n_k_8]; int delta_n = (n_k_8 - n_k) / group; for (int i = 0; i < n_k_8; i++) for (int j = 0; j < c_k_4; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k_8 / group - delta_n) && (i < n_k_8 / group)) || (i >= n_k_8 - delta_n)) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = 0; else if (i >= n_k_8 / group) kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_4 + k * w_k * c_k_4 + l * c_k_4 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k_8; i++) { if (((i >= n_k_8 / group - delta_n) && (i < n_k_8 / group)) || (i >= n_k_8 - delta_n)) biasArray[i] = 0; else if (i >= n_k_8 / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript48 = new ScriptC_convRolledInF4OutF8(myRS); myScript48.set_Bias_Blob(biasAllocation); myScript48.set_Kernel_Blob(kernelAllocation); myScript48.set_n_k(n_k_8); myScript48.set_c_k(c_k_4); myScript48.set_h_k(h_k); myScript48.set_w_k(w_k); myScript48.set_pad_x(pad[0]); myScript48.set_pad_y(pad[1]); myScript48.set_stride_x(stride[0]); myScript48.set_stride_y(stride[1]); myScript48.set_group(group); } private void initKernelF8F1(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_8 = c_k; if (c_k % 8 != 0) c_k_8 = c_k + 8 - c_k % 8; Allocation kernelAllocation; Allocation biasAllocation; Type kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k * c_k_8 * h_k * w_k / 4); Type biasType = Type.createX(myRS, Element.F32(myRS), n_k); float[] kernelMatrix = new float[n_k * h_k * w_k * c_k_8]; float[] biasArray = new float[n_k]; int delta_n = (n_k - n_k) / group; for (int i = 0; i < n_k; i++) for (int j = 0; j < c_k_8; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k / group - delta_n) && (i < n_k / group)) || (i >= n_k - delta_n)) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = 0; else if (i >= n_k / group) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k; i++) { if (((i >= n_k / group - delta_n) && (i < n_k / group)) || (i >= n_k - delta_n)) biasArray[i] = 0; else if (i >= n_k / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript81 = new ScriptC_convRolledInF8OutF1(myRS); myScript81.set_Bias_Blob(biasAllocation); myScript81.set_Kernel_Blob(kernelAllocation); myScript81.set_n_k(n_k); myScript81.set_c_k(c_k_8); myScript81.set_h_k(h_k); myScript81.set_w_k(w_k); myScript81.set_pad_x(pad[0]); myScript81.set_pad_y(pad[1]); myScript81.set_stride_x(stride[0]); myScript81.set_stride_y(stride[1]); myScript81.set_group(group); } private void initKernelF8F2(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_8 = c_k; if (c_k % 8 != 0) c_k_8 = c_k + 8 - c_k % 8; int n_k_2 = n_k; if (n_k % 2 != 0) n_k_2 = n_k + 2 - n_k % 2; Allocation kernelAllocation; Allocation biasAllocation; Type kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k_2 * c_k_8 * h_k * w_k / 4); Type biasType = Type.createX(myRS, Element.F32_2(myRS), n_k_2 / 2); float[] kernelMatrix = new float[n_k_2 * h_k * w_k * c_k_8]; float[] biasArray = new float[n_k_2]; int delta_n = (n_k_2 - n_k) / group; for (int i = 0; i < n_k_2; i++) for (int j = 0; j < c_k_8; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k_2 / group - delta_n) && (i < n_k_2 / group)) || (i >= n_k_2 - delta_n)) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = 0; else if (i >= n_k_2 / group) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k_2; i++) { if (((i >= n_k_2 / group - delta_n) && (i < n_k_2 / group)) || (i >= n_k_2 - delta_n)) biasArray[i] = 0; else if (i >= n_k_2 / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript82 = new ScriptC_convRolledInF8OutF2(myRS); myScript82.set_Bias_Blob(biasAllocation); myScript82.set_Kernel_Blob(kernelAllocation); myScript82.set_n_k(n_k_2); myScript82.set_c_k(c_k_8); myScript82.set_h_k(h_k); myScript82.set_w_k(w_k); myScript82.set_pad_x(pad[0]); myScript82.set_pad_y(pad[1]); myScript82.set_stride_x(stride[0]); myScript82.set_stride_y(stride[1]); myScript82.set_group(group); } private void initKernelF8F4(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_8 = c_k; if (c_k % 8 != 0) c_k_8 = c_k + 8 - c_k % 8; int n_k_4 = n_k; if (n_k % 4 != 0) n_k_4 = n_k + 4 - n_k % 4; Allocation kernelAllocation; Allocation biasAllocation; Type kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k_4 * c_k_8 * h_k * w_k / 4); Type biasType = Type.createX(myRS, Element.F32_4(myRS), n_k_4 / 4); float[] kernelMatrix = new float[n_k_4 * h_k * w_k * c_k_8]; float[] biasArray = new float[n_k_4]; int delta_n = (n_k_4 - n_k) / group; for (int i = 0; i < n_k_4; i++) for (int j = 0; j < c_k_8; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k_4 / group - delta_n) && (i < n_k_4 / group)) || (i >= n_k_4 - delta_n)) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = 0; else if (i >= n_k_4 / group) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k_4; i++) { if (((i >= n_k_4 / group - delta_n) && (i < n_k_4 / group)) || (i >= n_k_4 - delta_n)) biasArray[i] = 0; else if (i >= n_k_4 / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript84 = new ScriptC_convRolledInF8OutF4(myRS); myScript84.set_Bias_Blob(biasAllocation); myScript84.set_Kernel_Blob(kernelAllocation); myScript84.set_n_k(n_k_4); myScript84.set_c_k(c_k_8); myScript84.set_h_k(h_k); myScript84.set_w_k(w_k); myScript84.set_pad_x(pad[0]); myScript84.set_pad_y(pad[1]); myScript84.set_stride_x(stride[0]); myScript84.set_stride_y(stride[1]); myScript84.set_group(group); } private void initKernelF8F8(float[][][][] myWeight, float[] myBias) { int n_k = myWeight.length; int c_k = myWeight[0].length; int h_k = myWeight[0][0].length; int w_k = myWeight[0][0][0].length; int c_k_8 = c_k; if (c_k % 8 != 0) c_k_8 = c_k + 8 - c_k % 8; int n_k_8 = n_k; if (n_k % 8 != 0) n_k_8 = n_k + 8 - n_k % 8; Allocation kernelAllocation; Allocation biasAllocation; Type kernelType = Type.createX(myRS, Element.F32_4(myRS), n_k_8 * c_k_8 * h_k * w_k / 4); Type biasType = Type.createX(myRS, Element.F32_4(myRS), n_k_8 / 4); float[] kernelMatrix = new float[n_k_8 * h_k * w_k * c_k_8]; float[] biasArray = new float[n_k_8]; int delta_n = (n_k_8 - n_k) / group; for (int i = 0; i < n_k_8; i++) for (int j = 0; j < c_k_8; j++) for (int k = 0; k < h_k; k++) for (int l = 0; l < w_k; l++) { if (j >= c_k || ((i >= n_k_8 / group - delta_n) && (i < n_k_8 / group)) || (i >= n_k_8 - delta_n)) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = 0; else if (i >= n_k_8 / group) kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i - delta_n][j][k][l]; else kernelMatrix[i * h_k * w_k * c_k_8 + k * w_k * c_k_8 + l * c_k_8 + j] = myWeight[i][j][k][l]; } for (int i = 0; i < n_k_8; i++) { if (((i >= n_k_8 / group - delta_n) && (i < n_k_8 / group)) || (i >= n_k_8 - delta_n)) biasArray[i] = 0; else if (i >= n_k_8 / group) biasArray[i] = myBias[i - delta_n]; else biasArray[i] = myBias[i]; } kernelAllocation = Allocation.createTyped(myRS, kernelType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); kernelAllocation.copyFrom(kernelMatrix); biasAllocation = Allocation.createTyped(myRS, biasType, Allocation.MipmapControl.MIPMAP_NONE, Allocation.USAGE_GRAPHICS_TEXTURE | Allocation.USAGE_SCRIPT); biasAllocation.copyFrom(biasArray); myScript88 = new ScriptC_convRolledInF8OutF8(myRS); myScript88.set_Bias_Blob(biasAllocation); myScript88.set_Kernel_Blob(kernelAllocation); myScript88.set_n_k(n_k_8); myScript88.set_c_k(c_k_8); myScript88.set_h_k(h_k); myScript88.set_w_k(w_k); myScript88.set_pad_x(pad[0]); myScript88.set_pad_y(pad[1]); myScript88.set_stride_x(stride[0]); myScript88.set_stride_y(stride[1]); myScript88.set_group(group); } /////////////////////////////////////////Tuning Function//////////////////////////////////////// private Object tuneFunction(float[][][][] input) { Log.d("CNNdroid", "layers." + name + ": Tuning process is starting..."); long tuneTime = System.currentTimeMillis(); Object[] objects = paramUnpacker.unpackerFunction(paramFilePath, new Class[]{float[][][][].class, float[].class}); float[][][][] myWeight = (float[][][][]) objects[0]; float[] myBias = (float[]) objects[1]; tuneNow = false; long[] time = new long[]{0, 0, 0, 0}; long temp; int c_i = input[0].length; float[][][][] tuneInput = new float[1][c_i][input[0][0].length][input[0][0][0].length]; tuneInput[0] = input[0]; if (c_i < 5) { for (int i = 0; i < 2; i++) { temp = System.currentTimeMillis(); initKernelF4F1(myWeight, myBias); convLayerRolledParInF4OutF1(tuneInput, myWeight, true); time[0] += System.currentTimeMillis() - temp; temp = System.currentTimeMillis(); initKernelF4F2(myWeight, myBias); convLayerRolledParInF4OutF2(tuneInput, myWeight, true); time[1] += System.currentTimeMillis() - temp; temp = System.currentTimeMillis(); initKernelF4F4(myWeight, myBias); convLayerRolledParInF4OutF4(tuneInput, myWeight, true); time[2] += System.currentTimeMillis() - temp; temp = System.currentTimeMillis(); initKernelF4F8(myWeight, myBias); convLayerRolledParInF4OutF8(tuneInput, myWeight, true); time[3] += System.currentTimeMillis() - temp; } int min = 0; for (int i = 0; i < 4; i++) if (time[i] <= time[min]) min = i; algorithm = names[min]; } else { for (int i = 0; i < 2; i++) { temp = System.currentTimeMillis(); initKernelF8F1(myWeight, myBias); convLayerRolledParInF8OutF1(tuneInput, myWeight, true); time[0] += System.currentTimeMillis() - temp; temp = System.currentTimeMillis(); initKernelF8F2(myWeight, myBias); convLayerRolledParInF8OutF2(tuneInput, myWeight, true); time[1] += System.currentTimeMillis() - temp; temp = System.currentTimeMillis(); initKernelF8F4(myWeight, myBias); convLayerRolledParInF8OutF4(tuneInput, myWeight, true); time[2] += System.currentTimeMillis() - temp; temp = System.currentTimeMillis(); initKernelF8F8(myWeight, myBias); convLayerRolledParInF8OutF8(tuneInput, myWeight, true); time[3] += System.currentTimeMillis() - temp; } int min = 0; for (int i = 0; i < 4; i++) if (time[i] <= time[min]) min = i; algorithm = names[min + 4]; } initKernelF4F8(myWeight, myBias); Object output = convLayerRolledParInF4OutF8(input, myWeight, true); writeFile(algorithm); if(loadParamsAtStart) { weight = myWeight; bias = myBias; switch (algorithm) { case "F4F1": initKernelF4F1(weight, bias); break; case "F4F2": initKernelF4F2(weight, bias); break; case "F4F4": initKernelF4F4(weight, bias); break; case "F4F8": initKernelF4F8(weight, bias); break; case "F8F1": initKernelF8F1(weight, bias); break; case "F8F2": initKernelF8F2(weight, bias); break; case "F8F4": initKernelF8F4(weight, bias); break; case "F8F8": initKernelF8F8(weight, bias); break; } } tuneTime = System.currentTimeMillis() - tuneTime; Log.d("CNNdroid", "layers." + name + ": Tuning process finished in " + tuneTime + "ms."); return output; } ////////////////////////////////////////Local Functions///////////////////////////////////////// private Object invokeFunctions(Object input, float[][][][] myWeight, float[] myBias, boolean destroy) { Object output = null; long runTime = System.currentTimeMillis(); if (!parallel) output = convLayerRolledSeq((float[][][][]) input, myWeight, myBias, pad, stride, group); else { if (tuneNow) { output = tuneFunction((float[][][][]) input); } else { switch (algorithm) { case "F4F1": output = convLayerRolledParInF4OutF1((float[][][][]) input, myWeight, destroy); break; case "F4F2": output = convLayerRolledParInF4OutF2((float[][][][]) input, myWeight, destroy); break; case "F4F4": output = convLayerRolledParInF4OutF4((float[][][][]) input, myWeight, destroy); break; case "F4F8": output = convLayerRolledParInF4OutF8((float[][][][]) input, myWeight, destroy); break; case "F8F1": output = convLayerRolledParInF8OutF1((float[][][][]) input, myWeight, destroy); break; case "F8F2": output = convLayerRolledParInF8OutF2((float[][][][]) input, myWeight, destroy); break; case "F8F4": output = convLayerRolledParInF8OutF4((float[][][][]) input, myWeight, destroy); break; case "F8F8": output = convLayerRolledParInF8OutF8((float[][][][]) input, myWeight, destroy); break; } } } runTime = System.currentTimeMillis() - runTime; Log.d("CNNdroid", "layers." + name + ": Computation Run Time = " + String.valueOf(runTime)); return output; } private boolean corrupted(String str) { for (int i = 0 ; i < names.length ; i++) if (str.equals(names[i])) return false; return true; } private void writeFile(String str) { File f = new File(tuningFolder + "/" + name + ".txt"); if(f.exists()) f.delete(); try { f.createNewFile(); FileOutputStream fos = new FileOutputStream(f); fos.write(str.getBytes()); fos.close(); } catch (Exception e) { e.printStackTrace(); } } }