rigaya / NVEnc

NVENCによる高速エンコードの性能実験
https://rigaya34589.blog.fc2.com/blog-category-17.html
Other
1.07k stars 112 forks source link

Fail to init ngx vsr filter #600

Closed rainman74 closed 3 months ago

rainman74 commented 3 months ago

Usage: -c hevc --profile main --tier high --level auto --preset quality --qvbr 28 --aq-temporal --aq-strength 0 --lookahead 32 --lookahead-level 0 --tf-level 0 --bframes 3 --ref 3 --bref-mode middle --output-res 1920x-2 --vpp-resize algo=ngx-vsr,vsr-quality=4 --audio-copy

avsw: --sub-copy/--vpp-subburn is set, but no subtitle stream found.
colorspace_conv: failed to build program source.
colorspace_conv: Runtime compilation failed
colorspace_conv: ---------------------------------------
colorspace_conv: --- Source of colorspace_conv ---
colorspace_conv: ---------------------------------------
colorspace_conv:   1 #ifndef _JITIFY_INCLUDE_GUARD_D641424C031A3459
colorspace_conv:   2 #define _JITIFY_INCLUDE_GUARD_D641424C031A3459
colorspace_conv:   3 #ifdef __CUDACC_RTC__
colorspace_conv:   4 #define COLORSPACE_FUNC __device__ __inline__
colorspace_conv:   5 #else
colorspace_conv:   6 #define COLORSPACE_FUNC static
colorspace_conv:   7 #include <cmath>
colorspace_conv:   8 #include <cfloat>
colorspace_conv:   9 _Pragma("warning") (push)
colorspace_conv:  10 _Pragma("warning") (disable: 4819)
colorspace_conv:  11 #include <cuda_runtime.h>
colorspace_conv:  12 _Pragma("warning") (pop)
colorspace_conv:  13 #endif
colorspace_conv:  14
colorspace_conv:  15 typedef float4 LUTVEC;
colorspace_conv:  16
colorspace_conv:  17 #ifndef clamp
colorspace_conv:  18 #define clamp(x, low, high) (((x) <= (high)) ? (((x) >= (low)) ? (x) : (low)) : (high))
colorspace_conv:  19 #endif
colorspace_conv:  20
colorspace_conv:  21
colorspace_conv:  22 const float REC709_ALPHA = 1.09929682680944f;
colorspace_conv:  23 const float REC709_BETA = 0.018053968510807f;
colorspace_conv:  24
colorspace_conv:  25 const float SMPTE_240M_ALPHA = 1.111572195921731f;
colorspace_conv:  26 const float SMPTE_240M_BETA  = 0.022821585529445f;
colorspace_conv:  27
colorspace_conv:  28 // Adjusted for continuity of first derivative.
colorspace_conv:  29 const float SRGB_ALPHA = 1.055010718947587f;
colorspace_conv:  30 const float SRGB_BETA = 0.003041282560128f;
colorspace_conv:  31
colorspace_conv:  32 const float ST2084_M1 = 0.1593017578125f;
colorspace_conv:  33 const float ST2084_M2 = 78.84375f;
colorspace_conv:  34 const float ST2084_C1 = 0.8359375f;
colorspace_conv:  35 const float ST2084_C2 = 18.8515625f;
colorspace_conv:  36 const float ST2084_C3 = 18.6875f;
colorspace_conv:  37
colorspace_conv:  38 const float ARIB_B67_A = 0.17883277f;
colorspace_conv:  39 const float ARIB_B67_B = 0.28466892f;
colorspace_conv:  40 const float ARIB_B67_C = 0.55991073f;
colorspace_conv:  41
colorspace_conv:  42 const float FLOAT_EPS = 1.175494351e-38f;
colorspace_conv:  43
colorspace_conv:  44 const float MP_REF_WHITE = 203.0f;
colorspace_conv:  45 const float MP_REF_WHITE_HLG = 3.17955f;
colorspace_conv:  46
colorspace_conv:  47 // Common constants for SMPTE ST.2084 (HDR)
colorspace_conv:  48 const float PQ_M1 = 2610.0f / 4096.0f * 1.0f / 4.0f;
colorspace_conv:  49 const float PQ_M2 = 2523.0f / 4096.0f * 128.0f;
colorspace_conv:  50 const float PQ_C1 = 3424.0f / 4096.0f;
colorspace_conv:  51 const float PQ_C2 = 2413.0f / 4096.0f * 32.0f;
colorspace_conv:  52 const float PQ_C3 = 2392.0f / 4096.0f * 32.0f;
colorspace_conv:  53
colorspace_conv:  54 // Chosen for compatibility with higher precision REC709_ALPHA/REC709_BETA.
colorspace_conv:  55 // See: ITU-R BT.2390-2 5.3.1
colorspace_conv:  56 const float ST2084_OOTF_SCALE = 59.49080238715383f;
colorspace_conv:  57
colorspace_conv:  58 COLORSPACE_FUNC float rec_709_oetf(float x) {
colorspace_conv:  59     if (x < REC709_BETA)
colorspace_conv:  60         x = x * 4.5f;
colorspace_conv:  61     else
colorspace_conv:  62         x = REC709_ALPHA * powf(x, 0.45f) - (REC709_ALPHA - 1.0f);
colorspace_conv:  63
colorspace_conv:  64     return x;
colorspace_conv:  65 }
colorspace_conv:  66
colorspace_conv:  67 COLORSPACE_FUNC float rec_709_inverse_oetf(float x) {
colorspace_conv:  68     if (x < 4.5f * REC709_BETA)
colorspace_conv:  69         x = x / 4.5f;
colorspace_conv:  70     else
colorspace_conv:  71         x = powf((x + (REC709_ALPHA - 1.0f)) / REC709_ALPHA, 1.0f / 0.45f);
colorspace_conv:  72
colorspace_conv:  73     return x;
colorspace_conv:  74 }
colorspace_conv:  75
colorspace_conv:  76 // Ignore the BT.1886 provisions for limited contrast and assume an ideal CRT.
colorspace_conv:  77 COLORSPACE_FUNC float rec_1886_eotf(float x) {
colorspace_conv:  78     return x < 0.0f ? 0.0f : powf(x, 2.4f);
colorspace_conv:  79 }
colorspace_conv:  80
colorspace_conv:  81 COLORSPACE_FUNC float rec_1886_inverse_eotf(float x) {
colorspace_conv:  82     return x < 0.0f ? 0.0f : powf(x, 1.0f / 2.4f);
colorspace_conv:  83 }
colorspace_conv:  84
colorspace_conv:  85 COLORSPACE_FUNC float ootf_1_2(float x) {
colorspace_conv:  86     return x < 0.0f ? x : powf(x, 1.2f);
colorspace_conv:  87 }
colorspace_conv:  88
colorspace_conv:  89 COLORSPACE_FUNC float inverse_ootf_1_2(float x) {
colorspace_conv:  90     return x < 0.0f ? x : powf(x, 1.0f / 1.2f);
colorspace_conv:  91 }
colorspace_conv:  92
colorspace_conv:  93 COLORSPACE_FUNC float ootf_st2084(float x) {
colorspace_conv:  94     return rec_1886_eotf(rec_709_oetf(x * ST2084_OOTF_SCALE)) / 100.0f;
colorspace_conv:  95 }
colorspace_conv:  96
colorspace_conv:  97 COLORSPACE_FUNC float inverse_ootf_st2084(float x) {
colorspace_conv:  98     return rec_709_inverse_oetf(rec_1886_inverse_eotf(x * 100.0f)) / ST2084_OOTF_SCALE;
colorspace_conv:  99 }
colorspace_conv: 100
colorspace_conv: 101 COLORSPACE_FUNC float log100_oetf(float x) {
colorspace_conv: 102     return x <= 0.01f ? 0.0f : 1.0f + log10f(x) * (1.0f / 2.0f);
colorspace_conv: 103 }
colorspace_conv: 104
colorspace_conv: 105 COLORSPACE_FUNC float log100_inverse_oetf(float x) {
colorspace_conv: 106     return x <= 0.0f ? 0.01f : powf(10.0f, 2 * (x - 1.0f));
colorspace_conv: 107 }
colorspace_conv: 108
colorspace_conv: 109 COLORSPACE_FUNC float log316_oetf(float x) {
colorspace_conv: 110     return x <= 0.00316227766f ? 0.0f : 1.0f + log10f(x) * (1.0f / 2.5f);
colorspace_conv: 111 }
colorspace_conv: 112
colorspace_conv: 113 COLORSPACE_FUNC float log316_inverse_oetf(float x) {
colorspace_conv: 114     return x <= 0.0f ? 0.00316227766f : powf(10.0f, 2.5f * (x - 1.0f));
colorspace_conv: 115 }
colorspace_conv: 116
colorspace_conv: 117 COLORSPACE_FUNC float rec_470m_oetf(float x) {
colorspace_conv: 118     return x < 0.0f ? 0.0f : powf(x, 2.2f);
colorspace_conv: 119 }
colorspace_conv: 120
colorspace_conv: 121 COLORSPACE_FUNC float rec_470m_inverse_oetf(float x) {
colorspace_conv: 122     return x < 0.0f ? 0.0f : powf(x, 1.0f / 2.2f);
colorspace_conv: 123 }
colorspace_conv: 124
colorspace_conv: 125 COLORSPACE_FUNC float rec_470bg_oetf(float x) {
colorspace_conv: 126     return x < 0.0f ? 0.0f : powf(x, 2.8f);
colorspace_conv: 127 }
colorspace_conv: 128
colorspace_conv: 129 COLORSPACE_FUNC float rec_470bg_inverse_oetf(float x) {
colorspace_conv: 130     return x < 0.0f ? 0.0f : powf(x, 1.0f / 2.8f);
colorspace_conv: 131 }
colorspace_conv: 132
colorspace_conv: 133 COLORSPACE_FUNC float smpte_240m_oetf(float x) {
colorspace_conv: 134     if (x < 4.0f * SMPTE_240M_BETA)
colorspace_conv: 135         x = x * (1.0f / 4.0f);
colorspace_conv: 136     else
colorspace_conv: 137         x = powf((x + (SMPTE_240M_ALPHA - 1.0f)) / SMPTE_240M_ALPHA, 1.0f / 0.45f);
colorspace_conv: 138
colorspace_conv: 139     return x;
colorspace_conv: 140 }
colorspace_conv: 141
colorspace_conv: 142 COLORSPACE_FUNC float smpte_240m_inverse_oetf(float x) {
colorspace_conv: 143     if (x < SMPTE_240M_BETA)
colorspace_conv: 144         x = x * 4.0f;
colorspace_conv: 145     else
colorspace_conv: 146         x = SMPTE_240M_ALPHA * powf(x, 0.45f) - (SMPTE_240M_ALPHA - 1.0f);
colorspace_conv: 147
colorspace_conv: 148     return x;
colorspace_conv: 149 }
colorspace_conv: 150
colorspace_conv: 151 COLORSPACE_FUNC float xvycc_oetf(float x) {
colorspace_conv: 152     return copysignf(rec_709_oetf(fabsf(x)), x);
colorspace_conv: 153 }
colorspace_conv: 154
colorspace_conv: 155 float xvycc_inverse_oetf(float x) {
colorspace_conv: 156     return copysignf(rec_709_inverse_oetf(fabsf(x)), x);
colorspace_conv: 157 }
colorspace_conv: 158
colorspace_conv: 159 COLORSPACE_FUNC float arib_b67_oetf(float x) {
colorspace_conv: 160     // Prevent negative pixels from yielding NAN.
colorspace_conv: 161     x = fmaxf(x, 0.0f);
colorspace_conv: 162
colorspace_conv: 163     if (x <= (1.0f / 12.0f))
colorspace_conv: 164         x = sqrtf(3.0f * x);
colorspace_conv: 165     else
colorspace_conv: 166         x = ARIB_B67_A * logf(12.0f * x - ARIB_B67_B) + ARIB_B67_C;
colorspace_conv: 167
colorspace_conv: 168     return x;
colorspace_conv: 169 }
colorspace_conv: 170
colorspace_conv: 171 COLORSPACE_FUNC float arib_b67_inverse_oetf(float x) {
colorspace_conv: 172     // Prevent negative pixels expanding into positive values.
colorspace_conv: 173     x = fmaxf(x, 0.0f);
colorspace_conv: 174
colorspace_conv: 175     if (x <= 0.5f)
colorspace_conv: 176         x = (x * x) * (1.0f / 3.0f);
colorspace_conv: 177     else
colorspace_conv: 178         x = (expf((x - ARIB_B67_C) / ARIB_B67_A) + ARIB_B67_B) * (1.0f / 12.0f);
colorspace_conv: 179
colorspace_conv: 180     return x;
colorspace_conv: 181 }
colorspace_conv: 182
colorspace_conv: 183 COLORSPACE_FUNC float srgb_eotf(float x) {
colorspace_conv: 184     if (x < 12.92f * SRGB_BETA)
colorspace_conv: 185         x *= (1.0f / 12.92f);
colorspace_conv: 186     else
colorspace_conv: 187         x = powf((x + (SRGB_ALPHA - 1.0f)) * (1.0f / SRGB_ALPHA), 2.4f);
colorspace_conv: 188
colorspace_conv: 189     return x;
colorspace_conv: 190 }
colorspace_conv: 191
colorspace_conv: 192 COLORSPACE_FUNC float srgb_inverse_eotf(float x) {
colorspace_conv: 193     if (x < SRGB_BETA)
colorspace_conv: 194         x = x * 12.92f;
colorspace_conv: 195     else
colorspace_conv: 196         x = SRGB_ALPHA * powf(x, 1.0f / 2.4f) - (SRGB_ALPHA - 1.0f);
colorspace_conv: 197
colorspace_conv: 198     return x;
colorspace_conv: 199 }
colorspace_conv: 200
colorspace_conv: 201 // Handle values in the range [0.0-1.0] such that they match a legacy CRT.
colorspace_conv: 202 COLORSPACE_FUNC float xvycc_eotf(float x) {
colorspace_conv: 203     if (x < 0.0f || x > 1.0f)
colorspace_conv: 204         return copysignf(rec_709_inverse_oetf(fabsf(x)), x);
colorspace_conv: 205     else
colorspace_conv: 206         return copysignf(rec_1886_eotf(fabsf(x)), x);
colorspace_conv: 207 }
colorspace_conv: 208
colorspace_conv: 209 COLORSPACE_FUNC float xvycc_inverse_eotf(float x) {
colorspace_conv: 210     if (x < 0.0f || x > 1.0f)
colorspace_conv: 211         return copysignf(rec_709_oetf(fabsf(x)), x);
colorspace_conv: 212     else
colorspace_conv: 213         return copysignf(rec_1886_inverse_eotf(fabsf(x)), x);
colorspace_conv: 214 }
colorspace_conv: 215
colorspace_conv: 216 //pq_space_to_linear
colorspace_conv: 217 COLORSPACE_FUNC float st_2084_eotf(float x) {
colorspace_conv: 218     // Filter negative values to avoid NAN.
colorspace_conv: 219     if (x > 0.0f) {
colorspace_conv: 220         float xpow = powf(x, 1.0f / ST2084_M2);
colorspace_conv: 221         float num = fmaxf(xpow - ST2084_C1, 0.0f);
colorspace_conv: 222         float den = fmaxf(ST2084_C2 - ST2084_C3 * xpow, FLOAT_EPS);
colorspace_conv: 223         x = powf(num / den, 1.0f / ST2084_M1);
colorspace_conv: 224     } else {
colorspace_conv: 225         x = 0.0f;
colorspace_conv: 226     }
colorspace_conv: 227
colorspace_conv: 228     return x;
colorspace_conv: 229 }
colorspace_conv: 230
colorspace_conv: 231 //linear_to_pq_space
colorspace_conv: 232 COLORSPACE_FUNC float st_2084_inverse_eotf(float x) {
colorspace_conv: 233     // Filter negative values to avoid NAN, and also special-case 0 so that (f(g(0)) == 0).
colorspace_conv: 234     if (x > 0.0f) {
colorspace_conv: 235         float xpow = powf(x, ST2084_M1);
colorspace_conv: 236 #if 0
colorspace_conv: 237         // Original formulation from SMPTE ST 2084:2014 publication.
colorspace_conv: 238         float num = ST2084_C1 + ST2084_C2 * xpow;
colorspace_conv: 239         float den = 1.0f + ST2084_C3 * xpow;
colorspace_conv: 240         x = powf(num / den, ST2084_M2);
colorspace_conv: 241 #else
colorspace_conv: 242         // More stable arrangement that avoids some cancellation error.
colorspace_conv: 243         float num = (ST2084_C1 - 1.0f) + (ST2084_C2 - ST2084_C3) * xpow;
colorspace_conv: 244         float den = 1.0f + ST2084_C3 * xpow;
colorspace_conv: 245         x = powf(1.0f + num / den, ST2084_M2);
colorspace_conv: 246 #endif
colorspace_conv: 247     } else {
colorspace_conv: 248         x = 0.0f;
colorspace_conv: 249     }
colorspace_conv: 250
colorspace_conv: 251     return x;
colorspace_conv: 252 }
colorspace_conv: 253
colorspace_conv: 254 // Applies a per-channel correction instead of the iterative method specified in Rec.2100.
colorspace_conv: 255 COLORSPACE_FUNC float arib_b67_eotf(float x) {
colorspace_conv: 256     return ootf_1_2(arib_b67_inverse_oetf(x));
colorspace_conv: 257 }
colorspace_conv: 258
colorspace_conv: 259 COLORSPACE_FUNC float arib_b67_inverse_eotf(float x) {
colorspace_conv: 260     return arib_b67_oetf(inverse_ootf_1_2(x));
colorspace_conv: 261 }
colorspace_conv: 262
colorspace_conv: 263 COLORSPACE_FUNC float st_2084_oetf(float x) {
colorspace_conv: 264     return st_2084_inverse_eotf(ootf_st2084(x));
colorspace_conv: 265 }
colorspace_conv: 266
colorspace_conv: 267 COLORSPACE_FUNC float st_2084_inverse_oetf(float x) {
colorspace_conv: 268     return inverse_ootf_st2084(st_2084_eotf(x));
colorspace_conv: 269 }
colorspace_conv: 270
colorspace_conv: 271 COLORSPACE_FUNC float3 aribB67Ops(float3 v, float kr, float kg, float kb, float scale) {
colorspace_conv: 272     const float gamma = 1.2f;
colorspace_conv: 273     float r = v.x * scale;
colorspace_conv: 274     float g = v.y * scale;
colorspace_conv: 275     float b = v.z * scale;
colorspace_conv: 276
colorspace_conv: 277     float yd = fmaxf(kr * r + kg * g + kb * b, FLOAT_EPS);
colorspace_conv: 278     float ys_inv = powf(yd, (1.0f - gamma) / gamma);
colorspace_conv: 279
colorspace_conv: 280     v.x = arib_b67_oetf(r * ys_inv);
colorspace_conv: 281     v.y = arib_b67_oetf(g * ys_inv);
colorspace_conv: 282     v.z = arib_b67_oetf(b * ys_inv);
colorspace_conv: 283     return v;
colorspace_conv: 284 }
colorspace_conv: 285
colorspace_conv: 286 COLORSPACE_FUNC float3 aribB67InvOps(float3 v, float kr, float kg, float kb, float scale) {
colorspace_conv: 287     const float gamma = 1.2f;
colorspace_conv: 288     float r = v.x;
colorspace_conv: 289     float g = v.y;
colorspace_conv: 290     float b = v.z;
colorspace_conv: 291
colorspace_conv: 292     float ys = fmaxf(kr * r + kg * g + kb * b, FLOAT_EPS);
colorspace_conv: 293     ys = powf(ys, gamma - 1.0f);
colorspace_conv: 294
colorspace_conv: 295     v.x = arib_b67_inverse_oetf(r * ys) * scale;
colorspace_conv: 296     v.y = arib_b67_inverse_oetf(g * ys) * scale;
colorspace_conv: 297     v.z = arib_b67_inverse_oetf(b * ys) * scale;
colorspace_conv: 298     return v;
colorspace_conv: 299 }
colorspace_conv: 300
colorspace_conv: 301 COLORSPACE_FUNC float3 matrix_mul(float m[3][3], float3 v) {
colorspace_conv: 302     float3 ret;
colorspace_conv: 303     ret.x = m[0][0] * v.x + m[0][1] * v.y + m[0][2] * v.z;
colorspace_conv: 304     ret.y = m[1][0] * v.x + m[1][1] * v.y + m[1][2] * v.z;
colorspace_conv: 305     ret.z = m[2][0] * v.x + m[2][1] * v.y + m[2][2] * v.z;
colorspace_conv: 306     return ret;
colorspace_conv: 307 }
colorspace_conv: 308
colorspace_conv: 309 //参考: https://gist.github.com/4re/34ccbb95732c1bef47c3d2975ac62395
colorspace_conv: 310 COLORSPACE_FUNC float hable(float x, float A, float B, float C, float D, float E, float F) {
colorspace_conv: 311     return ((x*(A*x+C*B)+D*E) / (x*(A*x+B)+D*F)) - E/F;
colorspace_conv: 312 }
colorspace_conv: 313
colorspace_conv: 314 COLORSPACE_FUNC float hdr2sdr_hable(float x, float source_peak, float ldr_nits, float A, float B, float C, float D, float E, float F) {
colorspace_conv: 315     const float eb = source_peak / ldr_nits;
colorspace_conv: 316     const float t0 = hable(x, A, B, C, D, E, F);
colorspace_conv: 317     const float t1 = hable(eb, A, B, C, D, E, F);
colorspace_conv: 318     return t0 / t1;
colorspace_conv: 319 }
colorspace_conv: 320
colorspace_conv: 321 COLORSPACE_FUNC float hdr2sdr_mobius(float x, float source_peak, float ldr_nits, float t, float peak) {
colorspace_conv: 322     const float eb = source_peak / ldr_nits;
colorspace_conv: 323     peak *= eb;
colorspace_conv: 324     if (x <= t) {
colorspace_conv: 325         return x;
colorspace_conv: 326     }
colorspace_conv: 327
colorspace_conv: 328     float a = -t * t * (peak - 1.0f) / (t * t - 2.0f * t + peak);
colorspace_conv: 329     float b = (t * t - 2.0f * t * peak + peak) / fmaxf(peak - 1.0f, 1e-6f);
colorspace_conv: 330     return (b * b + 2.0f * b * t + t * t) / (b - a) * (x + a) / (x + b);
colorspace_conv: 331 }
colorspace_conv: 332
colorspace_conv: 333 COLORSPACE_FUNC float hdr2sdr_reinhard(float x, float source_peak, float ldr_nits, float offset, float peak) {
colorspace_conv: 334     const float eb = source_peak / ldr_nits;
colorspace_conv: 335     peak *= eb;
colorspace_conv: 336     return x / (x + offset) * (peak + offset) / peak;
colorspace_conv: 337 }
colorspace_conv: 338
colorspace_conv: 339 COLORSPACE_FUNC float linear_to_pq_space(float x) {
colorspace_conv: 340     if (x > 0.0f) {
colorspace_conv: 341         x *= MP_REF_WHITE / 10000.0f;
colorspace_conv: 342         x = powf(x, PQ_M1);
colorspace_conv: 343         x = (PQ_C1 + PQ_C2 * x) / (1.0f + PQ_C3 * x);
colorspace_conv: 344         x = powf(x, PQ_M2);
colorspace_conv: 345         return x;
colorspace_conv: 346     } else {
colorspace_conv: 347         return 0.0f;
colorspace_conv: 348     }
colorspace_conv: 349 }
colorspace_conv: 350
colorspace_conv: 351 COLORSPACE_FUNC float pq_space_to_linear(float x) {
colorspace_conv: 352     if (x > 0.0f) {
colorspace_conv: 353         x = powf(x, 1.0f / PQ_M2);
colorspace_conv: 354         x = fmaxf(x - PQ_C1, 0.0f) / (PQ_C2 - PQ_C3 * x);
colorspace_conv: 355         x = powf(x, 1.0f / PQ_M1);
colorspace_conv: 356         x *= 10000.0f / MP_REF_WHITE;
colorspace_conv: 357         return x;
colorspace_conv: 358     } else {
colorspace_conv: 359         return 0.0f;
colorspace_conv: 360     }
colorspace_conv: 361 }
colorspace_conv: 362
colorspace_conv: 363 COLORSPACE_FUNC float apply_bt2390(float x, const float maxLum) {
colorspace_conv: 364     const float ks = 1.5f * maxLum - 0.5f;
colorspace_conv: 365     float tb = (x - ks) / (1.0f - ks);
colorspace_conv: 366     float tb2 = tb * tb;
colorspace_conv: 367     float tb3 = tb2 * tb;
colorspace_conv: 368     float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks +
colorspace_conv: 369         (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) +
colorspace_conv: 370         (-2.0f * tb3 + 3.0f * tb2) * maxLum;
colorspace_conv: 371     //x = mix(pb, x, lessThan(x, ks));
colorspace_conv: 372     x = (x < ks) ? x : pb;
colorspace_conv: 373     return x;
colorspace_conv: 374 }
colorspace_conv: 375
colorspace_conv: 376 COLORSPACE_FUNC float mix(float x, float y, float a) {
colorspace_conv: 377     a = (a < 0.0f) ? 0.0f : a;
colorspace_conv: 378     a = (a > 1.0f) ? 1.0f : a;
colorspace_conv: 379     return (x) * (1.0f - (a)) + (y) * (a);
colorspace_conv: 380 }
colorspace_conv: 381
colorspace_conv: 382 COLORSPACE_FUNC float lut3d_linear_interp(float v0, float v1, float a) {
colorspace_conv: 383     return v0 + (v1 - v0) * a;
colorspace_conv: 384 }
colorspace_conv: 385
colorspace_conv: 386 COLORSPACE_FUNC float3 lut3d_linear_interp(float3 v0, float3 v1, float a) {
colorspace_conv: 387     float3 r;
colorspace_conv: 388     r.x = lut3d_linear_interp(v0.x, v1.x, a);
colorspace_conv: 389     r.y = lut3d_linear_interp(v0.y, v1.y, a);
colorspace_conv: 390     r.z = lut3d_linear_interp(v0.z, v1.z, a);
colorspace_conv: 391     return r;
colorspace_conv: 392 }
colorspace_conv: 393
colorspace_conv: 394 COLORSPACE_FUNC int lut3d_prev_idx(float x) {
colorspace_conv: 395     return (int)x;
colorspace_conv: 396 }
colorspace_conv: 397
colorspace_conv: 398 COLORSPACE_FUNC int lut3d_near_idx(float x) {
colorspace_conv: 399     return (int)(x + 0.5f);
colorspace_conv: 400 }
colorspace_conv: 401
colorspace_conv: 402 COLORSPACE_FUNC int lut3d_next_idx(float x, int size) {
colorspace_conv: 403     int next = lut3d_prev_idx(x) + 1;
colorspace_conv: 404     return (next >= size) ? size - 1 : next;
colorspace_conv: 405 }
colorspace_conv: 406
colorspace_conv: 407 COLORSPACE_FUNC float lut3d_prelut(const float s, const int idx, const int size,
colorspace_conv: 408     const float prelutmin[3], const float prelutscale[3], const float *__restrict__ prelut) {
colorspace_conv: 409     const float x = clamp((s - prelutmin[idx]) * prelutscale[idx], 0.0f, (float)(size - 1));
colorspace_conv: 410     const float c0 = prelut[idx * size + lut3d_prev_idx(x)];
colorspace_conv: 411     const float c1 = prelut[idx * size + lut3d_next_idx(x, size)];
colorspace_conv: 412     return lut3d_linear_interp(c0, c1, x - lut3d_prev_idx(x));
colorspace_conv: 413 }
colorspace_conv: 414
colorspace_conv: 415 COLORSPACE_FUNC float3 lut3d_prelut(const float3 in, const int size,
colorspace_conv: 416     const float prelutmin[3], const float prelutscale[3], const float *__restrict__ prelut) {
colorspace_conv: 417     float3 out;
colorspace_conv: 418     out.x = lut3d_prelut(in.x, 0, size, prelutmin, prelutscale, prelut);
colorspace_conv: 419     out.y = lut3d_prelut(in.y, 1, size, prelutmin, prelutscale, prelut);
colorspace_conv: 420     out.z = lut3d_prelut(in.z, 2, size, prelutmin, prelutscale, prelut);
colorspace_conv: 421     return out;
colorspace_conv: 422 }
colorspace_conv: 423
colorspace_conv: 424 COLORSPACE_FUNC float3 lut3d_get_table(const LUTVEC *__restrict__ lut, const int x, const int y, const int z, const int lutSize0, const int lutSize01) {
colorspace_conv: 425     LUTVEC val = lut[x * lutSize01 + y * lutSize0 + z];
colorspace_conv: 426     float3 out;
colorspace_conv: 427     out.x = val.x;
colorspace_conv: 428     out.y = val.y;
colorspace_conv: 429     out.z = val.z;
colorspace_conv: 430     return out;
colorspace_conv: 431 }
colorspace_conv: 432
colorspace_conv: 433 COLORSPACE_FUNC float3 lut3d_interp_nearest(float3 in, const LUTVEC *__restrict__ lut, const int lutSize0, const int lutSize01) {
colorspace_conv: 434     return lut3d_get_table(lut, lut3d_near_idx(in.x), lut3d_near_idx(in.y), lut3d_near_idx(in.z), lutSize0, lutSize01);
colorspace_conv: 435 }
colorspace_conv: 436
colorspace_conv: 437 //参考: https://en.wikipedia.org/wiki/Trilinear_interpolation
colorspace_conv: 438 COLORSPACE_FUNC float3 lut3d_interp_trilinear(float3 in, const LUTVEC *__restrict__ lut, const int lutSize0, const int lutSize01) {
colorspace_conv: 439     const int x0 = lut3d_prev_idx(in.x);
colorspace_conv: 440     const int x1 = lut3d_next_idx(in.x, lutSize0);
colorspace_conv: 441     const int y0 = lut3d_prev_idx(in.y);
colorspace_conv: 442     const int y1 = lut3d_next_idx(in.y, lutSize0);
colorspace_conv: 443     const int z0 = lut3d_prev_idx(in.z);
colorspace_conv: 444     const int z1 = lut3d_next_idx(in.z, lutSize0);
colorspace_conv: 445     const float scalex = in.x - x0;
colorspace_conv: 446     const float scaley = in.y - y0;
colorspace_conv: 447     const float scalez = in.z - z0;
colorspace_conv: 448     const float3 c000  = lut3d_get_table(lut, x0, y0, z0, lutSize0, lutSize01);
colorspace_conv: 449     const float3 c001  = lut3d_get_table(lut, x0, y0, z1, lutSize0, lutSize01);
colorspace_conv: 450     const float3 c010  = lut3d_get_table(lut, x0, y1, z0, lutSize0, lutSize01);
colorspace_conv: 451     const float3 c011  = lut3d_get_table(lut, x0, y1, z1, lutSize0, lutSize01);
colorspace_conv: 452     const float3 c100  = lut3d_get_table(lut, x1, y0, z0, lutSize0, lutSize01);
colorspace_conv: 453     const float3 c101  = lut3d_get_table(lut, x1, y0, z1, lutSize0, lutSize01);
colorspace_conv: 454     const float3 c110  = lut3d_get_table(lut, x1, y1, z0, lutSize0, lutSize01);
colorspace_conv: 455     const float3 c111  = lut3d_get_table(lut, x1, y1, z1, lutSize0, lutSize01);
colorspace_conv: 456     const float3 c00   = lut3d_linear_interp(c000, c100, scalex);
colorspace_conv: 457     const float3 c10   = lut3d_linear_interp(c010, c110, scalex);
colorspace_conv: 458     const float3 c01   = lut3d_linear_interp(c001, c101, scalex);
colorspace_conv: 459     const float3 c11   = lut3d_linear_interp(c011, c111, scalex);
colorspace_conv: 460     const float3 c0    = lut3d_linear_interp(c00,  c10,  scaley);
colorspace_conv: 461     const float3 c1    = lut3d_linear_interp(c01,  c11,  scaley);
colorspace_conv: 462     const float3 c     = lut3d_linear_interp(c0,   c1,   scalez);
colorspace_conv: 463     return c;
colorspace_conv: 464 }
colorspace_conv: 465
colorspace_conv: 466 //参考: http://www.filmlight.ltd.uk/pdf/whitepapers/FL-TL-TN-0057-SoftwareLib.pdf
colorspace_conv: 467 COLORSPACE_FUNC float3 lut3d_interp_tetrahedral(float3 in, const LUTVEC *__restrict__ lut, const int lutSize0, const int lutSize01) {
colorspace_conv: 468     const int x0 = lut3d_prev_idx(in.x);
colorspace_conv: 469     const int x1 = lut3d_next_idx(in.x, lutSize0);
colorspace_conv: 470     const int y0 = lut3d_prev_idx(in.y);
colorspace_conv: 471     const int y1 = lut3d_next_idx(in.y, lutSize0);
colorspace_conv: 472     const int z0 = lut3d_prev_idx(in.z);
colorspace_conv: 473     const int z1 = lut3d_next_idx(in.z, lutSize0);
colorspace_conv: 474     const float scalex = in.x - x0;
colorspace_conv: 475     const float scaley = in.y - y0;
colorspace_conv: 476     const float scalez = in.z - z0;
colorspace_conv: 477     float scale0, scale1, scale2;
colorspace_conv: 478     int xA, yA, zA, xB, yB, zB;
colorspace_conv: 479     if (scalex > scaley) {
colorspace_conv: 480         if (scaley > scalez) {
colorspace_conv: 481             scale0 = scalex;
colorspace_conv: 482             scale1 = scaley;
colorspace_conv: 483             scale2 = scalez;
colorspace_conv: 484             xA = x1; yA = y0; zA = z0;
colorspace_conv: 485             xB = x1; yB = y1; zB = z0;
colorspace_conv: 486         } else if (scalex > scalez) {
colorspace_conv: 487             scale0 = scalex;
colorspace_conv: 488             scale1 = scalez;
colorspace_conv: 489             scale2 = scaley;
colorspace_conv: 490             xA = x1; yA = y0; zA = z0;
colorspace_conv: 491             xB = x1; yB = y0; zB = z1;
colorspace_conv: 492         } else {
colorspace_conv: 493             scale0 = scalez;
colorspace_conv: 494             scale1 = scalex;
colorspace_conv: 495             scale2 = scaley;
colorspace_conv: 496             xA = x0; yA = y0; zA = z1;
colorspace_conv: 497             xB = x1; yB = y0; zB = z1;
colorspace_conv: 498         }
colorspace_conv: 499     } else {
colorspace_conv: 500         if (scalez > scaley) {
colorspace_conv: 501             scale0 = scalez;
colorspace_conv: 502             scale1 = scaley;
colorspace_conv: 503             scale2 = scalex;
colorspace_conv: 504             xA = x0; yA = y0; zA = z1;
colorspace_conv: 505             xB = x0; yB = y1; zB = z1;
colorspace_conv: 506         } else if (scalez > scalex) {
colorspace_conv: 507             scale0 = scaley;
colorspace_conv: 508             scale1 = scalez;
colorspace_conv: 509             scale2 = scalex;
colorspace_conv: 510             xA = x0; yA = y1; zA = z0;
colorspace_conv: 511             xB = x0; yB = y1; zB = z1;
colorspace_conv: 512         } else {
colorspace_conv: 513             scale0 = scaley;
colorspace_conv: 514             scale1 = scalex;
colorspace_conv: 515             scale2 = scalez;
colorspace_conv: 516             xA = x0; yA = y1; zA = z0;
colorspace_conv: 517             xB = x1; yB = y1; zB = z0;
colorspace_conv: 518         }
colorspace_conv: 519     }
colorspace_conv: 520     const float3 c000 = lut3d_get_table(lut, x0, y0, z0, lutSize0, lutSize01);
colorspace_conv: 521     const float3 c111 = lut3d_get_table(lut, x1, y1, z1, lutSize0, lutSize01);
colorspace_conv: 522     const float3 cA   = lut3d_get_table(lut, xA, yA, zA, lutSize0, lutSize01);
colorspace_conv: 523     const float3 cB   = lut3d_get_table(lut, xB, yB, zB, lutSize0, lutSize01);
colorspace_conv: 524     const float  s0   = 1.0f   - scale0;
colorspace_conv: 525     const float  s1   = scale0 - scale1;
colorspace_conv: 526     const float  s2   = scale1 - scale2;
colorspace_conv: 527     const float  s3   = scale2;
colorspace_conv: 528     float3 c;
colorspace_conv: 529     c.x = s0 * c000.x + s1 * cA.x + s2 * cB.x + s3 * c111.x;
colorspace_conv: 530     c.y = s0 * c000.y + s1 * cA.y + s2 * cB.y + s3 * c111.y;
colorspace_conv: 531     c.z = s0 * c000.z + s1 * cA.z + s2 * cB.z + s3 * c111.z;
colorspace_conv: 532     return c;
colorspace_conv: 533 }
colorspace_conv: 534
colorspace_conv: 535 COLORSPACE_FUNC float3 lut3d_interp_pyramid(float3 in, const LUTVEC *lut, const int lutSize0, const int lutSize01) {
colorspace_conv: 536     const int x0 = lut3d_prev_idx(in.x);
colorspace_conv: 537     const int x1 = lut3d_next_idx(in.x, lutSize0);
colorspace_conv: 538     const int y0 = lut3d_prev_idx(in.y);
colorspace_conv: 539     const int y1 = lut3d_next_idx(in.y, lutSize0);
colorspace_conv: 540     const int z0 = lut3d_prev_idx(in.z);
colorspace_conv: 541     const int z1 = lut3d_next_idx(in.z, lutSize0);
colorspace_conv: 542     const float scalex = in.x - x0;
colorspace_conv: 543     const float scaley = in.y - y0;
colorspace_conv: 544     const float scalez = in.z - z0;
colorspace_conv: 545
colorspace_conv: 546     float scale0, scale1, scale2;
colorspace_conv: 547     int xA, yA, zA, xB, yB, zB, xC, yC, zC;
colorspace_conv: 548
colorspace_conv: 549     if (scaley > scalex && scalez > scalex) {
colorspace_conv: 550         xA = x0; yA = y0; zA = z1;
colorspace_conv: 551         xB = x0; yB = y1; zB = z0;
colorspace_conv: 552         xC = x0; yC = y1; zC = z1;
colorspace_conv: 553         scale0 = scaley;
colorspace_conv: 554         scale1 = scalez;
colorspace_conv: 555         scale2 = scalex;
colorspace_conv: 556     } else if (scalex > scaley && scalez > scaley) {
colorspace_conv: 557         xA = x0; yA = y0; zA = z1;
colorspace_conv: 558         xB = x1; yB = y0; zB = z0;
colorspace_conv: 559         xC = x1; yC = y0; zC = z1;
colorspace_conv: 560         scale0 = scalex;
colorspace_conv: 561         scale1 = scalez;
colorspace_conv: 562         scale2 = scaley;
colorspace_conv: 563     } else {
colorspace_conv: 564         xA = x0; yA = y1; zA = z0;
colorspace_conv: 565         xB = x1; yB = y0; zB = z0;
colorspace_conv: 566         xC = x1; yC = y1; zC = z0;
colorspace_conv: 567         scale0 = scalex;
colorspace_conv: 568         scale1 = scaley;
colorspace_conv: 569         scale2 = scalez;
colorspace_conv: 570     }
colorspace_conv: 571     const float3 c000 = lut3d_get_table(lut, x0, y0, z0, lutSize0, lutSize01);
colorspace_conv: 572     const float3 c111 = lut3d_get_table(lut, x1, y1, z1, lutSize0, lutSize01);
colorspace_conv: 573     const float3 cA   = lut3d_get_table(lut, xA, yA, zA, lutSize0, lutSize01);
colorspace_conv: 574     const float3 cB   = lut3d_get_table(lut, xB, yB, zB, lutSize0, lutSize01);
colorspace_conv: 575     const float3 cC   = lut3d_get_table(lut, xC, yC, zC, lutSize0, lutSize01);
colorspace_conv: 576     float3 c;
colorspace_conv: 577     c.x = c000.x + (cB.x - c000.x) * scale0 + (c111.x - cC.x) * scale2 + (cA.x - c000.x) * scale1 + (cC.x - cA.x - cB.x + c000.x) * scale0 * scale1;
colorspace_conv: 578     c.y = c000.y + (cB.y - c000.y) * scale0 + (c111.y - cC.y) * scale2 + (cA.y - c000.y) * scale1 + (cC.y - cA.y - cB.y + c000.y) * scale0 * scale1;
colorspace_conv: 579     c.z = c000.z + (cB.z - c000.z) * scale0 + (c111.z - cC.z) * scale2 + (cA.z - c000.z) * scale1 + (cC.z - cA.z - cB.z + c000.z) * scale0 * scale1;
colorspace_conv: 580     return c;
colorspace_conv: 581 }
colorspace_conv: 582
colorspace_conv: 583 COLORSPACE_FUNC float3 lut3d_interp_prism(float3 in, const LUTVEC *lut, const int lutSize0, const int lutSize01) {
colorspace_conv: 584     const int x0 = lut3d_prev_idx(in.x);
colorspace_conv: 585     const int x1 = lut3d_next_idx(in.x, lutSize0);
colorspace_conv: 586     const int y0 = lut3d_prev_idx(in.y);
colorspace_conv: 587     const int y1 = lut3d_next_idx(in.y, lutSize0);
colorspace_conv: 588     const int z0 = lut3d_prev_idx(in.z);
colorspace_conv: 589     const int z1 = lut3d_next_idx(in.z, lutSize0);
colorspace_conv: 590     const float scalex = in.x - x0;
colorspace_conv: 591     const float scaley = in.y - y0;
colorspace_conv: 592     const float scalez = in.z - z0;
colorspace_conv: 593     float scale0, scale2;
colorspace_conv: 594     int xA, yA, zA, xB, yB, zB;
colorspace_conv: 595
colorspace_conv: 596     if (scalez > scalex) {
colorspace_conv: 597         scale0 = scalez;
colorspace_conv: 598         scale2 = scalex;
colorspace_conv: 599         xA = x0; yA = y1; zA = z1;
colorspace_conv: 600         xB = x0; yB = y0; zB = z1;
colorspace_conv: 601     } else {
colorspace_conv: 602         scale0 = scalex;
colorspace_conv: 603         scale2 = scalez;
colorspace_conv: 604         xA = x1; yA = y1; zA = z0;
colorspace_conv: 605         xB = x1; yB = y0; zB = z0;
colorspace_conv: 606     }
colorspace_conv: 607     const float3 c000 = lut3d_get_table(lut, x0, y0, z0, lutSize0, lutSize01);
colorspace_conv: 608     const float3 c010 = lut3d_get_table(lut, x0, y1, z0, lutSize0, lutSize01);
colorspace_conv: 609     const float3 c101 = lut3d_get_table(lut, x1, y0, z1, lutSize0, lutSize01);
colorspace_conv: 610     const float3 c111 = lut3d_get_table(lut, x1, y1, z1, lutSize0, lutSize01);
colorspace_conv: 611     const float3 cA   = lut3d_get_table(lut, xA, yA, zA, lutSize0, lutSize01);
colorspace_conv: 612     const float3 cB   = lut3d_get_table(lut, xB, yB, zB, lutSize0, lutSize01);
colorspace_conv: 613     float3 c;
colorspace_conv: 614     c.x = c000.x + (cB.x - c000.x) * scale0 + (c101.x - cB.x) * scale2 + (c010.x - c000.x) * scaley + (c000.x - c010.x - cB.x + cA.x) * scale0 * scaley + (cB.x - cA.x - c101.x + c111.x) * scale2 * scaley;
colorspace_conv: 615     c.y = c000.y + (cB.y - c000.y) * scale0 + (c101.y - cB.y) * scale2 + (c010.y - c000.y) * scaley + (c000.y - c010.y - cB.y + cA.y) * scale0 * scaley + (cB.y - cA.y - c101.y + c111.y) * scale2 * scaley;
colorspace_conv: 616     c.z = c000.z + (cB.z - c000.z) * scale0 + (c101.z - cB.z) * scale2 + (c010.z - c000.z) * scaley + (c000.z - c010.z - cB.z + cA.z) * scale0 * scaley + (cB.z - cA.z - c101.z + c111.z) * scale2 * scaley;
colorspace_conv: 617     return c;
colorspace_conv: 618 }
colorspace_conv: 619
colorspace_conv: 620 struct RGYColorspaceDevParams {
colorspace_conv: 621     int lut_offset;
colorspace_conv: 622     int prelut_offset;
colorspace_conv: 623     // ???offset?????????
colorspace_conv: 624     // ???????????????????
colorspace_conv: 625 };
colorspace_conv: 626
colorspace_conv: 627 float *getDevParamsPrelut(void *__restrict__ ptr) {
colorspace_conv: 628     return (float *)((char *)ptr + ((RGYColorspaceDevParams *)ptr)->prelut_offset);
colorspace_conv: 629 }
colorspace_conv: 630
colorspace_conv: 631 const float *getDevParamsPrelut(const void *__restrict__ ptr) {
colorspace_conv: 632     return (const float *)((const char *)ptr + ((RGYColorspaceDevParams *)ptr)->prelut_offset);
colorspace_conv: 633 }
colorspace_conv: 634
colorspace_conv: 635 LUTVEC *getDevParamsLut(void *__restrict__ ptr) {
colorspace_conv: 636     return (LUTVEC *)((char *)ptr + ((RGYColorspaceDevParams *)ptr)->lut_offset);
colorspace_conv: 637 }
colorspace_conv: 638
colorspace_conv: 639 const LUTVEC *getDevParamsLut(const void *__restrict__ ptr) {
colorspace_conv: 640     return (const LUTVEC *)((const char *)ptr + ((RGYColorspaceDevParams *)ptr)->lut_offset);
colorspace_conv: 641 }
colorspace_conv: 642
colorspace_conv: 643
colorspace_conv: 644 #include <stdint.h>
colorspace_conv: 645
colorspace_conv: 646 __device__ __inline__
colorspace_conv: 647 float3 convert_colorspace_custom(float3 x, const RGYColorspaceDevParams *__restrict__ params) {
colorspace_conv: 648
colorspace_conv: 649     { //range int->float
colorspace_conv: 650         const float range_y   = 4,5662100456621002e-03f;
colorspace_conv: 651         const float offset_y  = -7,3059360730593603e-02f;
colorspace_conv: 652         const float range_uv  = 4,4642857142857140e-03f;
colorspace_conv: 653         const float offset_uv = -5,7142857142857140e-01f;
colorspace_conv: 654         x.x = x.x * range_y  + offset_y;
colorspace_conv: 655         x.y = x.y * range_uv + offset_uv;
colorspace_conv: 656         x.z = x.z * range_uv + offset_uv;
colorspace_conv: 657     }
colorspace_conv: 658
colorspace_conv: 659     {
colorspace_conv: 660         float m[3][3] = {
colorspace_conv: 661             { 1,0000000000000000e+00f, 0,0000000000000000e+00f, 1,4020000000000004e+00f },
colorspace_conv: 662             { 1,0000000000000002e+00f, -3,4413628620102216e-01f, -7,1413628620102221e-01f },
colorspace_conv: 663             { 1,0000000000000002e+00f, 1,7720000000000000e+00f, 0,0000000000000000e+00f }
colorspace_conv: 664         };
colorspace_conv: 665         x = matrix_mul(m, x);
colorspace_conv: 666     }
colorspace_conv: 667
colorspace_conv: 668     return x;
colorspace_conv: 669 }
colorspace_conv: 670
colorspace_conv: 671 static const int PIX_PER_THREAD = 4;
colorspace_conv: 672
colorspace_conv: 673 template<typename T> __device__ __inline__ T toPix(float x) { return (T)clamp((x) + 0.5f, 0.0f, (1<<(sizeof(T)*8)) - 0.5f); }
colorspace_conv: 674 template<> __device__ __inline__ float  toPix<float> (float x) { return x; }
colorspace_conv: 675
colorspace_conv: 676 template<typename TypeOut, typename TypeIn>
colorspace_conv: 677 __global__ void kernel_filter(
colorspace_conv: 678     uint8_t *__restrict__ pDstY, uint8_t *__restrict__ pDstU, uint8_t *__restrict__ pDstV,
colorspace_conv: 679     const int dstPitch, const int dstWidth, const int dstHeight,
colorspace_conv: 680     const uint8_t *__restrict__ pSrcY, const uint8_t *__restrict__ pSrcU, const uint8_t *__restrict__ pSrcV,
colorspace_conv: 681     const int srcPitch, const int srcWidth, const int srcHeight, bool srcInterlaced,
colorspace_conv: 682     const RGYColorspaceDevParams *__restrict__ params) {
colorspace_conv: 683     const int ix = (blockIdx.x * blockDim.x + threadIdx.x) * PIX_PER_THREAD;
colorspace_conv: 684     const int iy =  blockIdx.y * blockDim.y + threadIdx.y;
colorspace_conv: 685
colorspace_conv: 686     struct __align__(sizeof(TypeIn) * 4) TypeIn4 {
colorspace_conv: 687         TypeIn x, y, z, w;
colorspace_conv: 688     };
colorspace_conv: 689
colorspace_conv: 690     struct __align__(sizeof(TypeOut) * 4) TypeOut4 {
colorspace_conv: 691         TypeOut x, y, z, w;
colorspace_conv: 692     };
colorspace_conv: 693
colorspace_conv: 694     if (ix < dstWidth && iy < dstHeight) {
colorspace_conv: 695
colorspace_conv: 696         TypeIn4 srcY = *(TypeIn4 *)(pSrcY + iy * srcPitch + ix * sizeof(TypeIn));
colorspace_conv: 697         TypeIn4 srcU = *(TypeIn4 *)(pSrcU + iy * srcPitch + ix * sizeof(TypeIn));
colorspace_conv: 698         TypeIn4 srcV = *(TypeIn4 *)(pSrcV + iy * srcPitch + ix * sizeof(TypeIn));
colorspace_conv: 699
colorspace_conv: 700         float3 pix0 = make_float3((float)srcY.x, (float)srcU.x, (float)srcV.x);
colorspace_conv: 701         float3 pix1 = make_float3((float)srcY.y, (float)srcU.y, (float)srcV.y);
colorspace_conv: 702         float3 pix2 = make_float3((float)srcY.z, (float)srcU.z, (float)srcV.z);
colorspace_conv: 703         float3 pix3 = make_float3((float)srcY.w, (float)srcU.w, (float)srcV.w);
colorspace_conv: 704
colorspace_conv: 705         pix0 = convert_colorspace_custom(pix0, params);
colorspace_conv: 706         pix1 = convert_colorspace_custom(pix1, params);
colorspace_conv: 707         pix2 = convert_colorspace_custom(pix2, params);
colorspace_conv: 708         pix3 = convert_colorspace_custom(pix3, params);
colorspace_conv: 709
colorspace_conv: 710         TypeOut4 dstY, dstU, dstV;
colorspace_conv: 711         dstY.x = toPix<TypeOut>(pix0.x); dstU.x = toPix<TypeOut>(pix0.y); dstV.x = toPix<TypeOut>(pix0.z);
colorspace_conv: 712         dstY.y = toPix<TypeOut>(pix1.x); dstU.y = toPix<TypeOut>(pix1.y); dstV.y = toPix<TypeOut>(pix1.z);
colorspace_conv: 713         dstY.z = toPix<TypeOut>(pix2.x); dstU.z = toPix<TypeOut>(pix2.y); dstV.z = toPix<TypeOut>(pix2.z);
colorspace_conv: 714         dstY.w = toPix<TypeOut>(pix3.x); dstU.w = toPix<TypeOut>(pix3.y); dstV.w = toPix<TypeOut>(pix3.z);
colorspace_conv: 715
colorspace_conv: 716         TypeOut4 *ptrDstY = (TypeOut4 *)(pDstY + iy * dstPitch + ix * sizeof(TypeOut));
colorspace_conv: 717         TypeOut4 *ptrDstU = (TypeOut4 *)(pDstU + iy * dstPitch + ix * sizeof(TypeOut));
colorspace_conv: 718         TypeOut4 *ptrDstV = (TypeOut4 *)(pDstV + iy * dstPitch + ix * sizeof(TypeOut));
colorspace_conv: 719
colorspace_conv: 720         ptrDstY[0] = dstY;
colorspace_conv: 721         ptrDstU[0] = dstU;
colorspace_conv: 722         ptrDstV[0] = dstV;
colorspace_conv: 723     }
colorspace_conv: 724 };
colorspace_conv: 725
colorspace_conv: 726 #endif // _JITIFY_INCLUDE_GUARD_D641424C031A3459
colorspace_conv: ---------------------------------------
colorspace_conv: Compiler options: --use_fast_math -arch=compute_75
colorspace_conv: ---------------------------------------------------
colorspace_conv: --- JIT compile log for colorspace_conv ---
colorspace_conv: ---------------------------------------------------
colorspace_conv: colorspace_conv(650): error: expected an identifier
colorspace_conv: colorspace_conv(651): error: expected an identifier
colorspace_conv: colorspace_conv(652): error: expected an identifier
colorspace_conv: colorspace_conv(653): error: expected an identifier
colorspace_conv: colorspace_conv(661): error: too many initializer values
colorspace_conv: colorspace_conv(662): error: too many initializer values
colorspace_conv: colorspace_conv(663): error: too many initializer values
colorspace_conv: colorspace_conv(45): warning: variable "MP_REF_WHITE_HLG" was declared but never referenced
colorspace_conv: colorspace_conv(671): warning: variable "PIX_PER_THREAD" was declared but never referenced
colorspace_conv: 7 errors detected in the compilation of "colorspace_conv".
colorspace: failed to setup custom filter: error in cuda..
resize: Failed to init ngx vsr filter.

Testfile: https://github.com/rigaya/NVEnc/assets/16306963/69ccef1c-adfe-4c37-91ef-91ff1d424de9

Update: The problem is not specific to the test file; I get the same error with every other video I have tried.

rigaya commented 3 months ago

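This seems to be related to locale settings: the floating-point constants in the generated code are written with a decimal comma (e.g. `4,5662100456621002e-03f`), which the CUDA runtime compiler cannot parse. Would you please try the test build below? https://nightly.link/rigaya/NVEnc/actions/runs/9723978454/NVEncC_release_r2923_x64.zip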

rainman74 commented 3 months ago

> Seems to be something related to locale settings. Would you please try with the test build below? https://nightly.link/rigaya/NVEnc/actions/runs/9723978454/NVEncC_release_r2923_x64.zip

Yes! This version works.

avsw: --sub-copy/--vpp-subburn is set, but no subtitle stream found.
NVEncC (x64) 7.56 (r2923) by rigaya, Jun 29 2024 12:17:08 (VC 1929/Win)
OS Version     Windows 11 x64 (26100) [UTF-8]
CPU            11th Gen Intel Core i9-11900K @ 3.50GHz [TB: 5.00GHz] (8C/16T)
GPU            #0: NVIDIA GeForce RTX 3080 (8704 cores, 1800 MHz)[PCIe4x16][556.12]
NVENC / CUDA   NVENC API 12.2, CUDA 12.5, schedule mode: auto
Input Buffers  CUDA, 44 frames
Input Info     avsw: h264(yv12)->nv12 [AVX2], 720x540, 30000/1001 fps
Vpp Filters    copyHtoD
               cspconv(nv12 -> yv12)
               resize: ngx-vsr 720x540 -> 1920x1440
                           ngx-vsr: colorspace: cspconv(yv12 -> yuv444)
                                       matrix:smpte170m->GBR
                                    cspconv(rgb(fp32) -> rgb32)
                                    nvsdk-ngx vsr: quality: 4
                                    cspconv(rgb32 -> rgb(fp32))
                                    colorspace: matrix:GBR->smpte170m
               cspconv(yuv444(16bit) -> nv12)
Output Info    H.265/HEVC main @ Level auto
               1920x1440p 0:0 29.970fps (30000/1001fps)
               avwriter: hevc, aac => matroska
Encoder Preset quality
Rate Control   VBR
Multipass      none
Bitrate        0 kbps (Max: 96000 kbps)
Target Quality 28.00
QP range       I:0-51  P:0-51  B:0-51
QP Offset      cb:0  cr:0
VBV buf size   auto
Split Enc Mode auto
Lookahead      on, 32 frames, Level 0, Adaptive I, B Insert
GOP length     300 frames
B frames       3 frames [ref mode: middle]
Ref frames     3 frames, MultiRef L0:auto L1:auto
AQ             on (temporal, strength auto)
CU max / min   auto / auto
Others         mv:auto

The quality of the NGX VSR AI Upscaler is amazing!

rigaya commented 3 months ago

Thank you for confirming. The fix will be included in NVEnc 7.57, which is currently being built.