Optimize first layer a bit

kasper93 commented 7 months ago

It would be nice to have a more in-depth optimization, or even a compute shader version, for speed. Anyway, this micro-optimization of the first layer is low-hanging fruit.

remove 0 from mat4 and use vec4
multiply only one component that we care about
use gather to reduce texture fetches (3 instead of 9); this needs to be behind an ifdef, because gather is not always available

example, not tested, but something like that should produce the same result.

//!DESC ArtCNN C4F8 (Conv-0)
//!HOOK LUMA
//!BIND LUMA
//!SAVE conv2d_tf
//!WIDTH LUMA.w
//!HEIGHT LUMA.h
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w / 1.3 > OUTPUT.h LUMA.h / 1.3 > *
// Conv-0 sketch: single-channel 3x3 convolution producing 4 output features.
// Posted in the thread as an untested example of the proposed optimization.
// go_0 issues one gather at the given texel offset from the current LUMA
// position; gather is not always available, so a real shader needs an #ifdef
// fallback to plain texture fetches.
#define go_0(x_off, y_off) (LUMA_gather(LUMA_pos + LUMA_pt * vec2(x_off, y_off), 0))
vec4 hook() {
    // Three gathers replace the nine individual fetches of the original pass.
    // NOTE(review): if LUMA_gather maps to textureGather, each call returns a
    // 2x2 texel footprint, so the .x/.y/.z components used below may not be
    // the three taps of one kernel row/column — verify the tap mapping.
    vec4 a = go_0(-1.0, -1.0);
    vec4 b = go_0(-1.0, 0.0);
    vec4 c = go_0(-1.0, 1.0);
    // Each vec4 holds the weights of one kernel tap for the 4 output features;
    // only a single scalar input component is multiplied in (vec4 instead of
    // a mostly-zero mat4).
    vec4 result = vec4(0.05641021, -0.1615453, -0.06533657, 0.096413605) * a.x;
    result += vec4(0.1842147, 0.2938016, 0.147652, -0.22436947) * b.x;
    result += vec4(0.07949546, 0.039763954, -0.1269216, 0.12983432) * c.x;
    result += vec4(0.04739664, -0.10706943, 0.23155588, -0.3707046) * a.y;
    result += vec4(0.42271706, 0.009869351, 0.44073802, 0.7824819) * b.y;
    result += vec4(0.12183003, 0.22918288, -0.18820816, -0.24825077) * c.y;
    result += vec4(-0.012375349, -0.22000322, -0.1642485, 0.06789503) * a.z;
    result += vec4(0.045476228, -0.033663496, 0.14933448, -0.08178548) * b.z;
    result += vec4(-0.1617377, 0.029577361, 0.13827337, -0.06188948) * c.z;
    // Constant per-feature term added after all taps (presumably the layer bias).
    result += vec4(-0.03365639, -0.0049422677, -0.10187809, -0.025332384);
    return result;
}

Artoriuz commented 7 months ago

The only reason the gen script generates the code with mat4 instead of vec4 is because it's doing the same thing for all layers, and for it to be compatible with the NATIVE or MAIN hook points too (which doesn't matter currently as all variants hook LUMA).

I wouldn't expect that much of a speed-up from these changes alone as there's still a lot of texture fetches and math in the other passes, but I'd definitely be interested in trying to make the shader faster in general after I'm happy with its quality.

Thanks anyway =)

kasper93 commented 7 months ago

> The only reason the gen script generates the code with mat4 instead of vec4 is because it's doing the same thing for all layers, and for it to be compatible with the NATIVE or MAIN hook points too (which doesn't matter currently as all variants hook LUMA).

I understand that; the script could be made smarter with a few lines of code.

> I wouldn't expect that much of a speed-up from these changes alone as there's still a lot of texture fetches and math in the other passes

Of course, we are tackling only one layer, so it doesn't gain much, but it is still something. Conv-0 C4F32 before: Conv-0 C4F32 after:

Overall ~0% gain :^) (more for smaller variants ;p)

EDIT: 3% without joking about it ;p

kasper93 commented 6 months ago

I was thinking about what to do about the need to split layers into multiple passes, and we could do something like the below. Instead of writing separate textures in separate passes, write the convolution results into one texture with an offset. This is just a PoC I hacked together, but it does the job. It would probably be better for locality to interleave them instead of offsetting whole planes. There are probably better ways to transform that, but without the script to generate the whole thing, I was just focusing on one layer only.

diff --git a/ArtCNN_C4F8.glsl b/ArtCNN_C4F8.glsl
index ce1f4c19d..85ce20517 100644
--- a/ArtCNN_C4F8.glsl
+++ b/ArtCNN_C4F8.glsl
@@ -22,61 +22,79 @@

 //!DESC ArtCNN C4F8 (Conv-0)
 //!HOOK LUMA
-//!BIND LUMA
+//!BIND HOOKED
 //!SAVE conv2d_tf
 //!WIDTH LUMA.w
-//!HEIGHT LUMA.h
-//!COMPONENTS 4
-//!WHEN OUTPUT.w LUMA.w / 1.3 > OUTPUT.h LUMA.h / 1.3 > *
-#define input_0(x_off, y_off) (LUMA_texOff(vec2(x_off, y_off)).x)
-vec4 hook() {
-    vec4 result = vec4(0.05606065, -0.16205873, -0.06744548, 0.102317154) * input_0(-1.0, -1.0);
-    result += vec4(0.18186232, 0.29038453, 0.14702061, -0.21449055) * input_0(-1.0, 0.0);
-    result += vec4(0.06950167, 0.03814084, -0.13157128, 0.12728645) * input_0(-1.0, 1.0);
-    result += vec4(0.04409975, -0.0926296, 0.2295825, -0.3638749) * input_0(0.0, -1.0);
-    result += vec4(0.42566386, 0.012273267, 0.444948, 0.7928637) * input_0(0.0, 0.0);
-    result += vec4(0.11805404, 0.22412042, -0.1881149, -0.24530376) * input_0(0.0, 1.0);
-    result += vec4(-0.017324802, -0.21393242, -0.16470031, 0.07029795) * input_0(1.0, -1.0);
-    result += vec4(0.044412095, -0.0391854, 0.14883068, -0.07879292) * input_0(1.0, 0.0);
-    result += vec4(-0.15559202, 0.023217754, 0.13561617, -0.05926498) * input_0(1.0, 1.0);
-    result += vec4(-0.028808497, -0.0019450301, -0.10220286, -0.02527487);
-    return result;
-}
+//!HEIGHT LUMA.h 2 *
+//!COMPUTE 32 32 32 16
+const ivec2 ksize = ivec2(3, 3);
+const ivec2 offset = ksize / 2;
+const ivec2 isize = ivec2(gl_WorkGroupSize) + ksize - 1;
+shared float inp[isize.y][isize.x];
+void hook()
+{
+    ivec2 base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
+    for (uint y = gl_LocalInvocationID.y; y < isize.y; y += gl_WorkGroupSize.y) {
+        for (uint x = gl_LocalInvocationID.x; x < isize.x; x += gl_WorkGroupSize.x)
+            inp[y][x] = texelFetch(HOOKED_raw, base + ivec2(x,y) - offset, 0).x;
+    }

-//!DESC ArtCNN C4F8 (Conv-0)
-//!HOOK LUMA
-//!BIND LUMA
-//!SAVE conv2d_tf1
-//!WIDTH LUMA.w
-//!HEIGHT LUMA.h
-//!COMPONENTS 4
-//!WHEN OUTPUT.w LUMA.w / 1.3 > OUTPUT.h LUMA.h / 1.3 > *
-#define input_0(x_off, y_off) (LUMA_texOff(vec2(x_off, y_off)).x)
-vec4 hook() {
-    vec4 result = vec4(-0.13154525, 0.21037114, 0.011722393, -0.22440465) * input_0(-1.0, -1.0);
-    result += vec4(-0.09714491, 0.026232203, 0.3571225, 0.010353373) * input_0(-1.0, 0.0);
-    result += vec4(-0.106218226, -0.00950985, -0.040633723, 0.045972545) * input_0(-1.0, 1.0);
-    result += vec4(0.28405476, -0.07419559, -0.15994127, 0.09044172) * input_0(0.0, -1.0);
-    result += vec4(0.41553396, -0.21216054, 0.18486117, 0.22783878) * input_0(0.0, 0.0);
-    result += vec4(-0.016779939, 0.2289691, 0.043418467, 0.1276168) * input_0(0.0, 1.0);
-    result += vec4(0.07561329, 0.21008377, -0.19550227, -0.0990998) * input_0(1.0, -1.0);
-    result += vec4(-0.381104, -0.26568213, 0.01505651, -0.14155756) * input_0(1.0, 0.0);
-    result += vec4(-0.013306489, -0.022957662, -0.20943819, -0.023217427) * input_0(1.0, 1.0);
-    result += vec4(0.024538051, -0.022951547, -0.053772032, -0.03247772);
-    return result;
+    barrier();
+
+    const vec4 weights[2][9] = vec4[2][9](
+        vec4[9](
+            vec4(0.05606065, -0.16205873, -0.06744548, 0.102317154),
+            vec4(0.18186232, 0.29038453, 0.14702061, -0.21449055),
+            vec4(0.06950167, 0.03814084, -0.13157128, 0.12728645),
+            vec4(0.04409975, -0.0926296, 0.2295825, -0.3638749),
+            vec4(0.42566386, 0.012273267, 0.444948, 0.7928637),
+            vec4(0.11805404, 0.22412042, -0.1881149, -0.24530376),
+            vec4(-0.017324802, -0.21393242, -0.16470031, 0.07029795),
+            vec4(0.044412095, -0.0391854, 0.14883068, -0.07879292),
+            vec4(-0.15559202, 0.023217754, 0.13561617, -0.05926498)
+        ),
+        vec4[9](
+            vec4(-0.13154525, 0.21037114, 0.011722393, -0.22440465),
+            vec4(-0.09714491, 0.026232203, 0.3571225, 0.010353373),
+            vec4(-0.106218226, -0.00950985, -0.040633723, 0.045972545),
+            vec4(0.28405476, -0.07419559, -0.15994127, 0.09044172),
+            vec4(0.41553396, -0.21216054, 0.18486117, 0.22783878),
+            vec4(-0.016779939, 0.2289691, 0.043418467, 0.1276168),
+            vec4(0.07561329, 0.21008377, -0.19550227, -0.0990998),
+            vec4(-0.381104, -0.26568213, 0.01505651, -0.14155756),
+            vec4(-0.013306489, -0.022957662, -0.20943819, -0.023217427)
+        )
+    );
+    const vec4 biases[2] = vec4[2](
+        vec4(-0.028808497, -0.0019450301, -0.10220286, -0.02527487),
+        vec4(0.024538051, -0.022951547, -0.053772032, -0.03247772)
+    );
+
+    vec4 result0 = biases[0];
+    vec4 result1 = biases[1];
+    for (uint y = 0; y < ksize.y; y++) {
+        for (uint x = 0; x < ksize.x; x++) {
+            result0 += weights[0][x*ksize.y+y] * inp[gl_LocalInvocationID.y+y][gl_LocalInvocationID.x+x];
+            result1 += weights[1][x*ksize.y+y] * inp[gl_LocalInvocationID.y+y][gl_LocalInvocationID.x+x];
+        }
+    }
+
+    imageStore(out_image, ivec2(gl_GlobalInvocationID), result0);
+    uvec2 next = uvec2(gl_GlobalInvocationID);
+    next.y += gl_NumWorkGroups.y * gl_WorkGroupSize.y;
+    imageStore(out_image, ivec2(next), result1);
 }

 //!DESC ArtCNN C4F8 (Conv-1-ReLU)
 //!HOOK LUMA
 //!BIND conv2d_tf
-//!BIND conv2d_tf1
 //!SAVE conv2d_1_tf
 //!WIDTH LUMA.w
 //!HEIGHT LUMA.h
 //!COMPONENTS 4
 //!WHEN OUTPUT.w LUMA.w / 1.3 > OUTPUT.h LUMA.h / 1.3 > *
-#define input_0(x_off, y_off) (conv2d_tf_texOff(vec2(x_off, y_off)))
-#define input_1(x_off, y_off) (conv2d_tf1_texOff(vec2(x_off, y_off)))
+#define input_0(x_off, y_off) (conv2d_tf_tex(vec2(conv2d_tf_pos.x, conv2d_tf_pos.y / 2.0) + conv2d_tf_pt * vec2(x_off, y_off)))
+#define input_1(x_off, y_off) (conv2d_tf_tex(vec2(conv2d_tf_pos.x, 0.5 + conv2d_tf_pos.y / 2.0) + conv2d_tf_pt * vec2(x_off, y_off)))
 vec4 hook() {
     vec4 result = mat4(-0.06594259, 0.08844314, 0.008467282, 0.04818018, -0.073808886, 0.07391563, 0.036448322, -0.22967598, 0.06553942, 0.26806965, -0.13101095, -0.1323187, 0.009232229, 0.33652565, -0.011220336, 0.07229487) * input_0(-1.0, -1.0);
     result += mat4(-0.0025739854, -0.13449624, -0.2867584, 0.2121231, -0.3647844, -0.21884026, 0.018203944, -0.328268, 0.04012776, 0.10124714, -0.10839534, -0.14526665, -0.22862774, -0.07146004, -0.16987513, 0.33927646) * input_0(-1.0, 0.0);
@@ -103,14 +121,13 @@ vec4 hook() {
 //!DESC ArtCNN C4F8 (Conv-1-ReLU)
 //!HOOK LUMA
 //!BIND conv2d_tf
-//!BIND conv2d_tf1
 //!SAVE conv2d_1_tf1
 //!WIDTH LUMA.w
 //!HEIGHT LUMA.h
 //!COMPONENTS 4
 //!WHEN OUTPUT.w LUMA.w / 1.3 > OUTPUT.h LUMA.h / 1.3 > *
-#define input_0(x_off, y_off) (conv2d_tf_texOff(vec2(x_off, y_off)))
-#define input_1(x_off, y_off) (conv2d_tf1_texOff(vec2(x_off, y_off)))
+#define input_0(x_off, y_off) (conv2d_tf_tex(vec2(conv2d_tf_pos.x, conv2d_tf_pos.y / 2.0) + conv2d_tf_pt * vec2(x_off, y_off)))
+#define input_1(x_off, y_off) (conv2d_tf_tex(vec2(conv2d_tf_pos.x, 0.5 + conv2d_tf_pos.y / 2.0) + conv2d_tf_pt * vec2(x_off, y_off)))
 vec4 hook() {
     vec4 result = mat4(0.0021582565, -0.22699508, -0.030275995, -0.14288013, -0.040258944, 0.06518147, -0.039108302, 0.0868597, 0.05119967, 0.23472577, 0.0667181, 0.41770715, 0.16437882, -0.7233007, -0.1005974, 0.13744535) * input_0(-1.0, -1.0);
     result += mat4(0.18252012, -0.035088897, -0.06800512, -0.062662706, -0.31290275, -0.20951368, -0.15828873, -0.38042054, -0.22148538, 0.19237393, -0.07749552, -0.27006102, 0.25913042, -0.04529954, -0.109627336, -0.18988605) * input_0(-1.0, 0.0);
@@ -411,14 +428,13 @@ vec4 hook() {
 //!BIND conv2d_5_tf
 //!BIND conv2d_tf
 //!BIND conv2d_5_tf1
-//!BIND conv2d_tf1
 //!SAVE conv2d_6_tf
 //!WIDTH LUMA.w
 //!HEIGHT LUMA.h
 //!COMPONENTS 4
 //!WHEN OUTPUT.w LUMA.w / 1.3 > OUTPUT.h LUMA.h / 1.3 > *
-#define input_0(x_off, y_off) ((conv2d_5_tf_texOff(vec2(x_off, y_off)))+(conv2d_tf_texOff(vec2(x_off, y_off))))
-#define input_1(x_off, y_off) ((conv2d_5_tf1_texOff(vec2(x_off, y_off)))+(conv2d_tf1_texOff(vec2(x_off, y_off))))
+#define input_0(x_off, y_off) ((conv2d_5_tf_texOff(vec2(x_off, y_off)))+(conv2d_tf_tex(vec2(conv2d_tf_pos.x, conv2d_tf_pos.y / 2.0) + conv2d_tf_pt * vec2(x_off, y_off))))
+#define input_1(x_off, y_off) ((conv2d_5_tf1_texOff(vec2(x_off, y_off)))+(conv2d_tf_tex(vec2(conv2d_tf_pos.x, 0.5 + conv2d_tf_pos.y / 2.0) + conv2d_tf_pt * vec2(x_off, y_off))))
 vec4 hook() {
     vec4 result = mat4(0.04684254, -0.061442196, -0.128818, -0.14015275, -0.024019029, 0.022235809, 0.01950005, 0.021192499, -0.127092, 0.011945784, 0.019604033, 0.032572355, 0.14100526, -0.032037817, -0.012280388, -0.055487834) * input_0(-1.0, -1.0);
     result += mat4(-0.04955578, -0.12209969, 0.041126937, -0.00949945, -0.18128929, 0.12401965, -0.24880213, 0.16046867, 0.19397554, 0.02617344, 0.091755286, 0.039046668, 0.027770039, -0.06447701, 0.025285898, -0.15997665) * input_0(-1.0, 0.0);

Artoriuz commented 5 months ago

Just an update (to the public mostly, kasper already knows this), I've actually written a gen script incorporating the improvements above, but the resulting shader has a few issues:

The edges are not being treated gracefully. This is admittedly a bit difficult to notice when you're in fullscreen and the content matches your aspect ratio, but it becomes very easy to notice with letterboxing.
Performance is better for C4F16 but it's worse for C4F32 and C4F64. Still a bit unsure why but it might have to do with the size of the shared memory.
The actual logic seems a bit fragile and it can break on some resolutions (doesn't seem to happen on common video resolutions though).

Not really worth releasing in its current state but I'm gonna leave it here anyway:

#Generate clever compute shader
import numpy as np

def generate_shader_code(current_layer, previous_layer, channels_in, channels_out):
    """Generate the mpv user-shader compute pass for a single model layer.

    Convolution layers ("conv2d" in the layer name) become a compute shader
    that stages the input in shared memory and writes every group of four
    output channels as a separate plane, stacked vertically in one texture.
    The "depth" layer becomes the final pass that picks one of the four
    channels of the previous layer per output pixel at 2x resolution.

    Args:
        current_layer: Layer to emit. Must expose ``name`` and, for
            convolution layers, ``get_weights()`` returning
            ``[kernel, bias]`` with the kernel indexed as
            ``[y, x, channel_in, channel_out]``.
        previous_layer: Layer feeding ``current_layer``; only ``name`` is
            read. A layer named "input_layer" is treated as the LUMA plane.
            The object itself is not modified.
        channels_in: Number of input channels (packed four per vec4 plane).
        channels_out: Number of output channels (packed four per vec4 plane).

    Returns:
        The shader pass as a string, terminated by a blank line.

    Note:
        Reads the module-level ``filters`` value for the ``//!DESC`` line.
    """
    passes_in = int(np.ceil(channels_in / 4.0))
    passes_out = int(np.ceil(channels_out / 4.0))

    current_name = current_layer.name
    # Work on a local copy of the name instead of renaming the caller's layer.
    previous_name = previous_layer.name
    if previous_name == "input_layer":
        previous_name = "LUMA"

    is_conv = "conv2d" in current_name
    is_depth = "depth" in current_name
    reads_add = "add" in previous_name
    # Only the hidden convolution layers 1-4 are followed by a ReLU.
    uses_relu = any(tag in current_name for tag in ("conv2d_1", "conv2d_2", "conv2d_3", "conv2d_4"))

    title = current_name.title().replace('_', '-')
    parts = []

    # ---- pass header (mpv //! directives) ----
    if uses_relu:
        parts.append(f"//!DESC ArtCNN C4F{filters} ({title}-ReLU)\n")
    else:
        parts.append(f"//!DESC ArtCNN C4F{filters} ({title})\n")
    # Block is 32x32 pixels; the thread count in y shrinks with the number of
    # output planes because each invocation stores passes_out texels.
    parts.append(f"//!COMPUTE 32 32 32 {int(32 / passes_out)}\n")
    parts.append("//!HOOK LUMA\n")

    if previous_name == "LUMA":
        parts.append(f"//!BIND {previous_name}\n")
    elif reads_add:
        # The residual add is not a pass of its own: bind both of its inputs
        # and sum them while loading.
        parts.append("//!BIND conv2d\n")
        parts.append("//!BIND conv2d_5\n")
    elif is_conv or is_depth:
        parts.append(f"//!BIND {previous_name}\n")

    if is_depth:
        # No //!SAVE directive is emitted for the 2x-sized final pass.
        parts.append("//!WIDTH LUMA.w 2.0 *\n")
        parts.append("//!HEIGHT LUMA.h 2.0 *\n")
    else:
        parts.append(f"//!SAVE {current_name}\n")
        parts.append("//!WIDTH LUMA.w\n")
        # Output planes are stacked vertically, one per group of 4 channels.
        parts.append(f"//!HEIGHT LUMA.h {float(passes_out)} *\n")

    parts.append("//!COMPONENTS 4\n")
    parts.append("//!WHEN OUTPUT.w LUMA.w / 1.3 > OUTPUT.h LUMA.h / 1.3 > *\n\n")

    # ---- pass body ----
    if is_conv:
        parts.append("const ivec2 ksize = ivec2(3, 3);\n")
        parts.append("const ivec2 offset = ksize / 2;\n")
        parts.append("const ivec2 isize = ivec2(gl_WorkGroupSize) + ksize - 1;\n")
        parts.append(f"shared vec4 inp[isize.x][isize.y][{passes_in}];\n")

        parts.append("void hook() {\n")
        parts.append("    ivec2 base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);\n")
        parts.append("    for (uint x = gl_LocalInvocationID.x; x < isize.x; x += gl_WorkGroupSize.x) {\n")
        parts.append("        for (uint y = gl_LocalInvocationID.y; y < isize.y; y += gl_WorkGroupSize.y) {\n")
        # NOTE(review): the fetch coordinates are not clamped; the author
        # reports ungraceful edge handling and breakage at some resolutions
        # for this generator — confirm before relying on the output.
        for z in range(passes_in):
            # Fraction of the stacked input texture's height where plane z starts.
            plane = float(z) / float(passes_in)
            if reads_add:
                parts.append(f"            inp[x][y][{z}] = conv2d_5_mul * texelFetch(conv2d_5_raw, (base + ivec2(x,y) - offset) + ivec2(0.0, conv2d_5_size.y * {plane}), 0) + conv2d_mul * texelFetch(conv2d_raw, (base + ivec2(x,y) - offset) + ivec2(0.0, conv2d_size.y * {plane}), 0);\n")
            else:
                parts.append(f"            inp[x][y][{z}] = {previous_name}_mul * texelFetch({previous_name}_raw, (base + ivec2(x,y) - offset) + ivec2(0.0, {previous_name}_size.y * {plane}), 0);\n")
        parts.append("        }\n")
        parts.append("    }\n")
        parts.append("\n    barrier();\n")

        # Fetch the weights once; they do not change between loop iterations.
        layer_weights = current_layer.get_weights()

        for pass_idx in range(passes_out):
            out_slice = slice(pass_idx * 4, (pass_idx + 1) * 4)
            biases = layer_weights[1][out_slice]
            biases_str = ", ".join(str(w) for w in biases.flatten())
            parts.append(f"    vec4 result{pass_idx} = vec4({biases_str});\n")

            for z in range(passes_in):
                for y in range(0, 3):
                    for x in range(0, 3):
                        weights = layer_weights[0][y, x, z*4:(z+1)*4, out_slice]
                        weights_str = ", ".join(str(w) for w in weights.flatten())

                        if weights_str:
                            if previous_name == "LUMA":
                                # Single input channel: a vec4 times the scalar luma sample.
                                parts.append(f"    result{pass_idx} += vec4({weights_str}) * inp[gl_LocalInvocationID.x + {x}][gl_LocalInvocationID.y + {y}][{z}].x;\n")
                            else:
                                parts.append(f"    result{pass_idx} += mat4({weights_str}) * inp[gl_LocalInvocationID.x + {x}][gl_LocalInvocationID.y + {y}][{z}];\n")

            if pass_idx == 0:
                parts.append("    ivec2 store_pos = ivec2(gl_GlobalInvocationID);\n")
            else:
                # Advance to the next vertically stacked output plane.
                parts.append("    store_pos.y += int(gl_NumWorkGroups.y * gl_WorkGroupSize.y);\n")

            if uses_relu:
                parts.append(f"    imageStore(out_image, store_pos, max(result{pass_idx}, vec4(0.0)));\n")
            else:
                parts.append(f"    imageStore(out_image, store_pos, result{pass_idx});\n")

    elif is_depth:
        parts.append("void hook() {\n")
        parts.append("    vec4 result = vec4(0.0, 0.0, 0.0, 1.0);\n")
        parts.append(f"    vec2 f0 = fract({previous_name}_pos * {previous_name}_size);\n")
        parts.append("    ivec2 i0 = ivec2(f0 * vec2(2.0));\n")
        parts.append(f"    result.x = {previous_name}_tex((vec2(0.5) - f0) * {previous_name}_pt + {previous_name}_pos)[i0.y * 2 + i0.x];\n")
        parts.append("    imageStore(out_image, ivec2(gl_GlobalInvocationID), clamp(result, 0.0, 1.0));\n")
    parts.append("}\n\n")

    return "".join(parts)

################################################################################
# Filter count taken from the layer at index 1 of the model (presumably the
# first convolution — verify against the model definition). It is passed as
# the channel count for the convolution layers and printed in each //!DESC line.
filters = model.layers[1].filters
# The generated shader starts with the MIT license text, written as GLSL
# line comments; the per-layer passes are appended after it.
shader_code = """// MIT License

// Copyright (c) 2024 Joao Chrisostomo

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

"""
# Walk consecutive (previous, current) layer pairs and append one shader pass
# for every recognised layer; layers with any other name emit nothing.
layers = model.layers
for previous, current in zip(layers, layers[1:]):
    name = current.name
    if name == "conv2d":
        # First convolution: single luma channel in.
        channels = (1, filters)
    elif name == "conv2d_6":
        # Last convolution: 4 channels out for the depth-to-space step.
        channels = (filters, 4)
    elif "conv2d_" in name:
        channels = (filters, filters)
    elif name == "depth_to_space":
        channels = (4, 1)
    else:
        continue
    shader_code += generate_shader_code(current, previous, *channels)

# Show the result and also write it to disk.
print(shader_code)
with open("meme.glsl", mode="w") as out_file:
    out_file.write(shader_code)

I'm also attaching the resulting C4F16 shader here, if anyone reading this is interested in testing it: ArtCNN_C4F16_DEV.txt

Artoriuz commented 4 months ago

I've pushed the "clever compute shaders" to the repo now. They work well enough and are much faster on Vulkan.

Artoriuz / ArtCNN

Optimize first layer a bit #4