Closed heckflosse closed 5 years ago
@Desmis My measures (median of 7) on my 8 core AMD:
transit_shapedetect: 5820 ms Lab_Local (includes transit_shapedetect): 10615 ms
With new opts (which I will post tomorrow) I get the following:
transit_shapedetect: 2660 ms Lab_Local (includes transit_shapedetect): 6100 ms
@Desmis In case you want to try before I create a pr, here's the patch:
diff --git a/rtengine/iplocallab.cc b/rtengine/iplocallab.cc
index 2be0d0ca6..8c2a24094 100644
--- a/rtengine/iplocallab.cc
+++ b/rtengine/iplocallab.cc
@@ -2852,7 +2852,7 @@ static void blendmask(const local_params& lp, int begx, int begy, int cx, int cy
#pragma omp parallel for schedule(dynamic,16)
#endif
- for (int y = 0; y < transformed->H ; y++) //{
+ for (int y = 0; y < transformed->H ; y++) {
for (int x = 0; x < transformed->W; x++) {
int lox = cx + x;
int loy = cy + y;
@@ -2900,14 +2900,11 @@ static void blendmask(const local_params& lp, int begx, int begy, int cx, int cy
original->b[y][x] = bufexporig->b[loy - begy][lox - begx];
}
-
}
}
-
-
}
}
-
+ }
}
static void showmask(const local_params& lp, int begx, int begy, int cx, int cy, int xEn, int yEn, LabImage* bufexporig, LabImage* transformed, LabImage* bufmaskorigSH)
@@ -7862,18 +7859,14 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
LabImage *bufmaskblurcol = nullptr;
LabImage *originalmaskcol = nullptr;
- int bfh = 0.f, bfw = 0.f;
- bfh = int (lp.ly + lp.lyT) + del; //bfw bfh real size of square zone
- bfw = int (lp.lx + lp.lxL) + del;
- array2D<float> buflight(bfw, bfh);
- JaggedArray<float> bufchro(bfw, bfh);
- JaggedArray<float> buflightslid(bfw, bfh);
- JaggedArray<float> bufchroslid(bfw, bfh);
- JaggedArray<float> bufhh(bfw, bfh);
+ const int bfh = int (lp.ly + lp.lyT) + del; //bfw bfh real size of square zone
+ const int bfw = int (lp.lx + lp.lxL) + del;
+ array2D<float> buflight(bfw, bfh, true);
+ JaggedArray<float> bufchro(bfw, bfh, true);
+ JaggedArray<float> bufhh(bfw, bfh, true);
JaggedArray<float> blend2(bfw, bfh);
- JaggedArray<float> buforigchro(bfw, bfh);
- JaggedArray<float> buf_a(bfw, bfh);
- JaggedArray<float> buf_b(bfw, bfh);
+ JaggedArray<float> buf_a(bfw, bfh, true);
+ JaggedArray<float> buf_b(bfw, bfh, true);
float adjustr = 1.0f;
float meansob = 0.f;
@@ -7896,48 +7889,20 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
}
if (call <= 3) { //simpleprocess, dcrop, improccoordinator
- bufcolorig = new LabImage(bfw, bfh);
+ bufcolorig = new LabImage(bfw, bfh, true);
if (lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5) {
- bufmaskorigcol = new LabImage(bfw, bfh);
- bufmaskblurcol = new LabImage(bfw, bfh);
- int GWm = transformed->W;
- int GHm = transformed->H;
+ bufmaskblurcol = new LabImage(bfw, bfh, true);
+ bufmaskorigcol = bufmaskblurcol;
+ const int GWm = transformed->W;
+ const int GHm = transformed->H;
originalmaskcol = new LabImage(GWm, GHm);
}
-#ifdef _OPENMP
- #pragma omp parallel for
-#endif
-
- for (int ir = 0; ir < bfh; ir++) //fill with 0
- for (int jr = 0; jr < bfw; jr++) {
- bufcolorig->L[ir][jr] = 0.f;
- bufcolorig->a[ir][jr] = 0.f;
- bufcolorig->b[ir][jr] = 0.f;
-
- if (lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5) {
- bufmaskorigcol->L[ir][jr] = 0.f;
- bufmaskorigcol->a[ir][jr] = 0.f;
- bufmaskorigcol->b[ir][jr] = 0.f;
- bufmaskblurcol->L[ir][jr] = 0.f;
- bufmaskblurcol->a[ir][jr] = 0.f;
- bufmaskblurcol->b[ir][jr] = 0.f;
- }
-
- bufchro[ir][jr] = 0.f;
- buf_a[ir][jr] = 0.f;
- buf_b[ir][jr] = 0.f;
- bufchroslid[ir][jr] = 0.f;
- buflightslid[ir][jr] = 0.f;
- buflight[ir][jr] = 0.f;
- bufhh[ir][jr] = 0.f;
- }
-
- int begy = lp.yc - lp.lyT;
- int begx = lp.xc - lp.lxL;
- int yEn = lp.yc + lp.ly;
- int xEn = lp.xc + lp.lx;
+ const int begy = lp.yc - lp.lyT;
+ const int begx = lp.xc - lp.lxL;
+ const int yEn = lp.yc + lp.ly;
+ const int xEn = lp.xc + lp.lx;
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,16)
#endif
@@ -8046,99 +8011,101 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
lutTonemaskexp[i] = CLIP(x * 65535.); // CLIP avoid in some case extra values
}
-// gamma_mask(lutTonemask, pwr, gamm, ts, gamm2);
gammamaskexp = &lutTonemaskexp;
+ if (lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5) {
+
#ifdef _OPENMP
- #pragma omp parallel for schedule(dynamic,16)
+ #pragma omp parallel for schedule(dynamic,16)
#endif
- for (int y = 0; y < transformed->H ; y++) //{
- for (int x = 0; x < transformed->W; x++) {
- int lox = cx + x;
- int loy = cy + y;
+ for (int y = 0; y < transformed->H ; y++) {
+ for (int x = 0; x < transformed->W; x++) {
+ int lox = cx + x;
+ int loy = cy + y;
- if (lox >= begx && lox < xEn && loy >= begy && loy < yEn) {
- if (lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5) {
+ if (lox >= begx && lox < xEn && loy >= begy && loy < yEn) {
bufmaskorigcol->L[loy - begy][lox - begx] = original->L[y][x];
bufmaskorigcol->a[loy - begy][lox - begx] = original->a[y][x];
bufmaskorigcol->b[loy - begy][lox - begx] = original->b[y][x];
- bufmaskblurcol->L[loy - begy][lox - begx] = original->L[y][x];
- bufmaskblurcol->a[loy - begy][lox - begx] = original->a[y][x];
- bufmaskblurcol->b[loy - begy][lox - begx] = original->b[y][x];
}
-
- bufcolorig->L[loy - begy][lox - begx] = original->L[y][x];
}
}
+// bufmaskblurcol->CopyFrom(bufmaskorigcol);
+ }
+
+StopWatch Stopl("Loop"); // 447
+ if (lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5) {
#ifdef _OPENMP
- #pragma omp parallel for schedule(dynamic,16)
+ #pragma omp parallel
+#endif
+ {
+#ifdef __SSE2__
+ float atan2Buffer[bfw] ALIGNED64;
+#endif
+#ifdef _OPENMP
+ #pragma omp for schedule(dynamic, 16)
#endif
- for (int ir = 0; ir < bfh; ir++)
- for (int jr = 0; jr < bfw; jr++) {
-
- float valLL = 0.f;
- float valCC = 0.f;
- float valHH = 0.f;
- float kmaskL = 0;
- float kmaskCa = 0;
- float kmaskCb = 0;
-
- float kmaskHL = 0;
- float kmaskHa = 0;
- float kmaskHb = 0;
-
- if (lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5) {
-
- if (locllmasCurve && llmasutili) {
- float ligh = (bufcolorig->L[ir][jr]) / 32768.f;
- valLL = (float)(locllmasCurve[500.f * ligh]);
- valLL = LIM01(1.f - valLL);
- kmaskL = 32768.f * valLL;
- }
-
- if (lp.showmaskcolmet != 5) {
- if (locccmasCurve && lcmasutili) {
- float chromask = 0.0001f + (sqrt(SQR(bufcolorig->a[ir][jr] / fab) + SQR(bufcolorig->b[ir][jr] / fab)));
- float chromaskr = chromask;// / 45000.f;
- valCC = float (locccmasCurve[500.f * chromaskr]);
- valCC = LIM01(1.f - valCC);
- kmaskCa = valCC;
- kmaskCb = valCC;
+ for (int ir = 0; ir < bfh; ir++) {
+#ifdef __SSE2__
+ if (lochhmasCurve && lhmasutili) {
+ int i = 0;
+ for (; i < bfw - 3; i += 4) {
+ STVF(atan2Buffer[i], xatan2f(LVFU(bufcolorig->b[ir][i]), LVFU(bufcolorig->a[ir][i])));
+ }
+ for (; i < bfw; i++) {
+ atan2Buffer[i] = xatan2f(bufcolorig->b[ir][i], bufcolorig->a[ir][i]);
}
}
+#endif
+ for (int jr = 0; jr < bfw; jr++) {
+ float kmaskL = 0.f;
+ float kmaskC = 0.f;
+ float kmaskHL = 0.f;
+ float kmaskH = 0.f;
- if (lochhmasCurve && lhmasutili) {
- float huema = xatan2f(bufcolorig->b[ir][jr], bufcolorig->a[ir][jr]);
- float h = Color::huelab_to_huehsv2(huema);
- h += 1.f / 6.f;
+ if (locllmasCurve && llmasutili) {
+ kmaskL = 32768.f * LIM01(1.f - locllmasCurve[(500.f / 32768.f) * bufcolorig->L[ir][jr]]);
+ }
- if (h > 1.f) {
- h -= 1.f;
+ if (lp.showmaskcolmet != 5 && locccmasCurve && lcmasutili) {
+ kmaskC = LIM01(1.f - locccmasCurve[500.f * (0.0001f + sqrt(SQR(bufcolorig->a[ir][jr]) + SQR(bufcolorig->b[ir][jr])) / fab)]);
}
- valHH = float (lochhmasCurve[500.f * h]);
- valHH = LIM01(1.f - valHH);
+ if (lochhmasCurve && lhmasutili) {
+#ifdef __SSE2__
+ const float huema = atan2Buffer[jr];
+#else
+ const float huema = xatan2f(bufcolorig->b[ir][jr], bufcolorig->a[ir][jr]);
+#endif
+ float h = Color::huelab_to_huehsv2(huema);
+ h += 1.f / 6.f;
- if (lp.showmaskcolmet != 5) {
- kmaskHa = valHH;
- kmaskHb = valHH;
+ if (h > 1.f) {
+ h -= 1.f;
+ }
+
+ const float valHH = LIM01(1.f - lochhmasCurve[500.f * h]);
+
+ if (lp.showmaskcolmet != 5) {
+ kmaskH = valHH;
+ }
+
+ kmaskHL = 32768.f * valHH;
}
- kmaskHL = 32768.f * valHH;
+ bufmaskblurcol->L[ir][jr] = CLIPLOC(kmaskL + kmaskHL);
+ bufmaskblurcol->a[ir][jr] = CLIPC(kmaskC + kmaskH);
+ bufmaskblurcol->b[ir][jr] = CLIPC(kmaskC + kmaskH);
+ ble[ir][jr] = bufmaskblurcol->L[ir][jr] / 32768.f;
+ guid[ir][jr] = bufcolorig->L[ir][jr] / 32768.f;
}
-
- bufmaskblurcol->L[ir][jr] = CLIPLOC(kmaskL + kmaskHL);
- bufmaskblurcol->a[ir][jr] = CLIPC(kmaskCa + kmaskHa);
- bufmaskblurcol->b[ir][jr] = CLIPC(kmaskCb + kmaskHb);
- ble[ir][jr] = bufmaskblurcol->L[ir][jr] / 32768.f;
- guid[ir][jr] = bufcolorig->L[ir][jr] / 32768.f;
}
-
}
-
+ }
+Stopl.stop();
if ((lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5)) {
if (lp.radmacol > 0.f) {
guidedFilter(guid, ble, ble, lp.radmacol * 10.f / sk, 0.001, multiThread, 4);
@@ -8160,6 +8127,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
float radiusb = 1.f / sk;
if (lp.showmaskcolmet == 2 || lp.enaColorMask || lp.showmaskcolmet == 3 || lp.showmaskcolmet == 5) {
+ StopWatch Stop1("gauss");
#ifdef _OPENMP
#pragma omp parallel
#endif
@@ -8168,9 +8136,10 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
gaussianBlur(bufmaskblurcol->a, bufmaskorigcol->a, bfw, bfh, 1.f + (0.5f * lp.radmacol) / sk);
gaussianBlur(bufmaskblurcol->b, bufmaskorigcol->b, bfw, bfh, 1.f + (0.5f * lp.radmacol) / sk);
}
- delete bufmaskblurcol;
+// delete bufmaskblurcol;
if (lp.showmaskcolmet == 0 || lp.showmaskcolmet == 1 || lp.showmaskcolmet == 2 || lp.showmaskcolmet == 4 || lp.showmaskcolmet == 5 || lp.enaColorMask) {
+ originalmaskcol->CopyFrom(transformed);
blendmask(lp, begx, begy, cx, cy, xEn, yEn, bufcolorig, transformed, original, bufmaskorigcol, originalmaskcol, lp.blendmacol);
delete bufmaskorigcol;
@@ -8178,7 +8147,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
} else if (lp.showmaskcolmet == 3) {
showmask(lp, begx, begy, cx, cy, xEn, yEn, bufcolorig, transformed, bufmaskorigcol);
- delete bufmaskorigcol;
+// delete bufmaskorigcol;
delete bufcolorig;
return;
@heckflosse This morning I discovered your patch... and I tested it with the NEF and pp3
Before patch transit_shapedetect: 3200 ms Lab_Local (includes transit_shapedetect): 6300 ms
After patch transit_shapedetect: 2150 ms Lab_Local (includes transit_shapedetect): 4600 ms About 33% speedup
No differences on TIFF
very good work:)
jacques
@Desmis Jacques, I created 'newlocallab-speedup2' branch which includes further improvements and cleanups.
Processing time now is: transit_shapedetect: 2530 ms Lab_Local (includes transit_shapedetect): 5350 ms
@heckflosse Ingo OK, I compile and test transit_shapedetect: 2080 ms Lab_Local (includes transit_shapedetect): 4500 ms
@Desmis Jacques, can you also please test before/after and check for differences using this pp3? test.pp3.txt
@heckflosse I tested with "test.pp3": same difference in speed-up between without patch and with patch... about 35%
No differences in TIFF, with layers
it's fun to see milestone appear and disappear
jacques
@Desmis Jacques, setting the milestone was by accident. It will take me a lot of work to cleanup and speedup the stuff. But I'm working hard on it :)
@Desmis Jacques, one of the harder parts will be to make serious speedups for the case you zoom 100% into an image which has a large locallab spot. This is really slow because currently the size of the spot determines the workload in this case, not the size of the viewing area.
Edit: For example panning in preview at 100% with the test.pp3
needs 2 seconds for each pan.
Edit2: For the same reason, opening detail windows is still slow in this case (2 seconds to open a detail window or move the area it shows)
@Desmis Jacques, to show you what's possible here's a mockup. The patch is for newlocallab-speedup2 branch. Currently it works only for the vibrance tool in locallab. Using other tools it will crash because I made the changes only for the vibrance tool. To test: Open the NEF from above and apply the pp3 attached to this post. Then open a detail window.
Lab_Local processing time before: ~1000 ms Lab_Local processing time after: ~5 ms
Edit: atm this is not optimized for full processing in queue/saveas...
diff --git a/rtengine/iplocallab.cc b/rtengine/iplocallab.cc
index 6c5ac4c0c..88d8eea23 100644
--- a/rtengine/iplocallab.cc
+++ b/rtengine/iplocallab.cc
@@ -3566,8 +3566,8 @@ void ImProcFunctions::transit_shapedetect(int senstype, LabImage * bufexporig, L
const float hhro = HHutili ? bufhh[loy - begy][lox - begx] : 0.f;
- const float cli = buflight[loy - begy][lox - begx];
- const float clc = (previewcol || previewexp || previewSH) ? settings->previewselection * 100.f : bufchro[loy - begy][lox - begx];
+ const float cli = buflight[y][x];
+ const float clc = (previewcol || previewexp || previewSH) ? settings->previewselection * 100.f : bufchro[y][x];
if (senstype <= 1) {
@@ -3607,7 +3607,7 @@ void ImProcFunctions::transit_shapedetect(int senstype, LabImage * bufexporig, L
float newhr = 0.f;
if (senstype == 4 || senstype == 6 || senstype == 2 || senstype == 3 || senstype == 8) {//all except color and light (TODO) and exposure
- float lightc = bufexporig->L[loy - begy][lox - begx];
+ float lightc = bufexporig->L[y][x];
float fli = ((100.f + realstrdE) / 100.f);
float diflc = lightc * fli - original->L[y][x];
diflc *= factorx;
@@ -3629,7 +3629,7 @@ void ImProcFunctions::transit_shapedetect(int senstype, LabImage * bufexporig, L
}
if (senstype == 7) {
- float difab = bufexporig->L[loy - begy][lox - begx] - sqrt(SQR(original->a[y][x]) + SQR(original->b[y][x]));
+ float difab = bufexporig->L[y][x] - sqrt(SQR(original->a[y][x]) + SQR(original->b[y][x]));
float difa = difab * cos(rhue);
float difb = difab * sin(rhue);
difa *= factorx * (100.f + realstrchdE) / 100.f;
@@ -3641,8 +3641,8 @@ void ImProcFunctions::transit_shapedetect(int senstype, LabImage * bufexporig, L
float tempb;
float flia = 1.f;
float flib = 1.f;
- const float chra = bufexporig->a[loy - begy][lox - begx];
- const float chrb = bufexporig->b[loy - begy][lox - begx];
+ const float chra = bufexporig->a[y][x];
+ const float chrb = bufexporig->b[y][x];
if (senstype == 4 || senstype == 6 || senstype == 2 || senstype == 3 || senstype == 8 || senstype == 9) {
flia = flib = ((100.f + realstrchdE) / 100.f);
@@ -3715,7 +3715,7 @@ void ImProcFunctions::transit_shapedetect(int senstype, LabImage * bufexporig, L
float newhr = 0.f;
if (senstype == 4 || senstype == 6 || senstype == 2 || senstype == 3 || senstype == 8) { //retinex & cbdl
- float lightc = bufexporig->L[loy - begy][lox - begx];
+ float lightc = bufexporig->L[y][x];
float fli = ((100.f + realstrdE) / 100.f);
float diflc = lightc * fli - original->L[y][x];
transformed->L[y][x] = CLIP(original->L[y][x] + diflc);
@@ -3736,7 +3736,7 @@ void ImProcFunctions::transit_shapedetect(int senstype, LabImage * bufexporig, L
}
if (senstype == 7) {//cbdl chroma
- float difab = bufexporig->L[loy - begy][lox - begx] - sqrt(SQR(original->a[y][x]) + SQR(original->b[y][x]));
+ float difab = bufexporig->L[y][x] - sqrt(SQR(original->a[y][x]) + SQR(original->b[y][x]));
float difa = difab * cos(rhue);
float difb = difab * sin(rhue);
difa *= (100.f + realstrchdE) / 100.f;
@@ -3748,8 +3748,8 @@ void ImProcFunctions::transit_shapedetect(int senstype, LabImage * bufexporig, L
float tempb;
float flia = 1.f;
float flib = 1.f;
- const float chra = bufexporig->a[loy - begy][lox - begx];
- const float chrb = bufexporig->b[loy - begy][lox - begx];
+ const float chra = bufexporig->a[y][x];
+ const float chrb = bufexporig->b[y][x];
if (senstype == 4 || senstype == 6 || senstype == 2 || senstype == 3 || senstype == 8 || senstype == 9) {
flia = flib = (100.f + realstrchdE) / 100.f;
@@ -6141,85 +6141,56 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
LabImage *bufexporig = nullptr;
LabImage *bufexpfin = nullptr;
- int bfh = int (lp.ly + lp.lyT) + del; //bfw bfh real size of square zone
- int bfw = int (lp.lx + lp.lxL) + del;
+ const int bfh = int (lp.ly + lp.lyT) + del; //bfw bfh real size of square zone
+ const int bfw = int (lp.lx + lp.lxL) + del;
- JaggedArray<float> buflight(bfw, bfh);
- JaggedArray<float> bufl_ab(bfw, bfh);
+ JaggedArray<float> buflight(transformed->W, transformed->H, true);
+ JaggedArray<float> bufl_ab(transformed->W, transformed->H, true);
if (call <= 3) { //simpleprocess, dcrop, improccoordinator
- bufexporig = new LabImage(bfw, bfh); //buffer for data in zone limit
- bufexpfin = new LabImage(bfw, bfh); //buffer for data in zone limit
+ bufexporig = new LabImage(transformed->W, transformed->H, true); //buffer for data in zone limit
+ bufexpfin = new LabImage(transformed->W, transformed->H, true); //buffer for data in zone limit
-
-#ifdef _OPENMP
- #pragma omp parallel for
-#endif
-
- for (int ir = 0; ir < bfh; ir++) //fill with 0
- for (int jr = 0; jr < bfw; jr++) {
- bufexporig->L[ir][jr] = 0.f;
- bufexporig->a[ir][jr] = 0.f;
- bufexporig->b[ir][jr] = 0.f;
- bufexpfin->L[ir][jr] = 0.f;
- bufexpfin->a[ir][jr] = 0.f;
- bufexpfin->b[ir][jr] = 0.f;
- buflight[ir][jr] = 0.f;
- bufl_ab[ir][jr] = 0.f;
-
-
- }
-
- int begy = lp.yc - lp.lyT;
- int begx = lp.xc - lp.lxL;
- int yEn = lp.yc + lp.ly;
- int xEn = lp.xc + lp.lx;
+ const int begy = lp.yc - lp.lyT;
+ const int begx = lp.xc - lp.lxL;
+ const int yEn = lp.yc + lp.ly;
+ const int xEn = lp.xc + lp.lx;
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,16)
#endif
- for (int y = 0; y < transformed->H ; y++) //{
+ for (int y = 0; y < transformed->H; y++) //{
for (int x = 0; x < transformed->W; x++) {
int lox = cx + x;
int loy = cy + y;
if (lox >= begx && lox < xEn && loy >= begy && loy < yEn) {
- bufexporig->L[loy - begy][lox - begx] = original->L[y][x];
- bufexporig->a[loy - begy][lox - begx] = original->a[y][x];
- bufexporig->b[loy - begy][lox - begx] = original->b[y][x];
+ bufexporig->L[y][x] = original->L[y][x];
+ bufexporig->a[y][x] = original->a[y][x];
+ bufexporig->b[y][x] = original->b[y][x];
}
}
-
-
- ImProcFunctions::vibrancelocal(sp, bfw, bfh, bufexporig, bufexpfin, localskutili, sklocalcurve);
-
-
+ ImProcFunctions::vibrancelocal(sp, transformed->W, transformed->H, bufexporig, bufexpfin, localskutili, sklocalcurve);
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,16)
#endif
- for (int ir = 0; ir < bfh; ir++)
- for (int jr = 0; jr < bfw; jr++) {
-
- float rL;
- rL = CLIPRET((bufexpfin->L[ir][jr] - bufexporig->L[ir][jr]) / 328.f);
-
- buflight[ir][jr] = rL;
-
-
- float chp;
- chp = CLIPRET((sqrt(SQR(bufexpfin->a[ir][jr]) + SQR(bufexpfin->b[ir][jr])) - sqrt(SQR(bufexporig->a[ir][jr]) + SQR(bufexporig->b[ir][jr]))) / 250.f);
-
- bufl_ab[ir][jr] = chp;
+ for (int y = 0; y < transformed->H; y++) //{
+ for (int x = 0; x < transformed->W; x++) {
+ int lox = cx + x;
+ int loy = cy + y;
- // }
+ if (lox >= begx && lox < xEn && loy >= begy && loy < yEn) {
+ buflight[y][x] = CLIPRET((bufexpfin->L[y][x] - bufexporig->L[y][x]) / 328.f);
+ bufl_ab[y][x] = CLIPRET((sqrt(SQR(bufexpfin->a[y][x]) + SQR(bufexpfin->b[y][x])) - sqrt(SQR(bufexporig->a[y][x]) + SQR(bufexporig->b[y][x]))) / 250.f);
+ }
}
transit_shapedetect(2, bufexporig, nullptr, buflight, bufl_ab, nullptr, nullptr, nullptr, false, hueref, chromaref, lumaref, sobelref, 0.f, nullptr, lp, original, transformed, cx, cy, sk);
@@ -6345,10 +6316,10 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
if ((lp.mulloc[0] != 1.f || lp.mulloc[1] != 1.f || lp.mulloc[2] != 1.f || lp.mulloc[3] != 1.f || lp.mulloc[4] != 1.f) && lp.cbdlena) {
int bfh = int (lp.ly + lp.lyT) + del; //bfw bfh real size of square zone
int bfw = int (lp.lx + lp.lxL) + del;
- array2D<float> buflight(bfw, bfh);
- JaggedArray<float> bufchrom(bfw, bfh);
- JaggedArray<float> bufchr(bfw, bfh);
- JaggedArray<float> bufsh(bfw, bfh);
+ array2D<float> buflight(bfw, bfh, true);
+ JaggedArray<float> bufchrom(bfw, bfh, true);
+ JaggedArray<float> bufchr(bfw, bfh, true);
+ JaggedArray<float> bufsh(bfw, bfh, true);
LabImage *loctemp = nullptr;
LabImage *loctempch = nullptr;
LabImage *origcbdl = nullptr;
@@ -6364,31 +6335,9 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
if (call <= 3) { //call from simpleprocess dcrop improcc
- loctemp = new LabImage(bfw, bfh);
- loctempch = new LabImage(bfw, bfh);
- origcbdl = new LabImage(bfw, bfh);
-
-#ifdef _OPENMP
- #pragma omp parallel for
-#endif
-
- for (int ir = 0; ir < bfh; ir++) //fill with 0
- for (int jr = 0; jr < bfw; jr++) {
- bufsh[ir][jr] = 0.f;
- buflight[ir][jr] = 0.f;
- bufchr[ir][jr] = 0.f;
- bufchrom[ir][jr] = 0.f;
- loctemp->L[ir][jr] = 0.f;
- loctemp->a[ir][jr] = 0.f;
- loctemp->b[ir][jr] = 0.f;
- loctempch->L[ir][jr] = 0.f;
- loctempch->a[ir][jr] = 0.f;
- loctempch->b[ir][jr] = 0.f;
- origcbdl->L[ir][jr] = 0.f;
- origcbdl->a[ir][jr] = 0.f;
- origcbdl->b[ir][jr] = 0.f;
- }
-
+ loctemp = new LabImage(bfw, bfh, true);
+ loctempch = new LabImage(bfw, bfh, true);
+ origcbdl = new LabImage(bfw, bfh, true);
int begy = lp.yc - lp.lyT;
int begx = lp.xc - lp.lxL;
@heckflosse How do you apply this patch? With which git command?
I do git apply in.patch
I build in.patch by copying the patch in my editor : notepad++
And I have this message $ git apply in.patch error: le patch a échoué : rtengine/iplocallab.cc:3566 error: rtengine/iplocallab.cc : le patch ne s'applique pas
@Desmis Jacques, try this one on clean newlocallab-speedup2 branch locallab.patch.txt
git apply locallab.patch.txt
@Desmis Jacques, this one is even better as its now also optimized for full processing. locallab2.patch.txt
@Desmis last one has a bug...
@Desmis Bug fixed locallab3.patch.txt
@heckflosse Thank you I am compiling
@heckflosse I compile, run fine I clean cache I load "Nikon- D850 - 14 bit ......NEF I load pp3 file "locallab_vibrance_mockup.pp3"
If I look at the result for speed-up: time for locallab about 20 ms, transit_shapedetect 10 ms
I see small changes in image (when I apply or not "vibrance") jacques
@heckflosse But how to do, to get a "good" patch ?
@Desmis Jacques, I will do the same for all Locallab tools during this week. I just used the vibrance tool because it was the easiest tool to make a mockup.
I just checked for differences in tif. Found none.
@Desmis Jacques, I would like to merge the current state of the newlocallab-speedup2 branch (without the mockups) into newlocallab to have a clean starting point for my work. Any objections?
@heckflosse No problem at all :) jacques
@Desmis merged
@Desmis Where do I find tonemapping in locallab gui?
I want to test and optimize this part in next step. https://github.com/Beep6581/RawTherapee/blob/newlocallab/rtengine/iplocallab.cc#L6240
@heckflosse You just need to enable the line 769 in locallab.cc panel->pack_start(*exptonemap, false, false);
I disabled it because I was told a few months ago that it was useless in "local", which I dispute.
jacques
@heckflosse Ingo I will be away from Friday 29 to tuesday 2 april
Thank you for this good work :)
jacques
@Desmis Jacques, here are a patch and a pp3 to test locallab tonemapping. Patch is to be applied on clean newlocallab-speedup2 or newlocallab branch
I got the following results:
before after
Open image: 260 ms 200 ms
Zoom to 100%: 7860 ms 380 ms
Open detail window: 7880 ms 20 ms
Full processing: 8850 ms 7050 ms
@heckflosse Ingo I test this morning 1) before patch Open image 140 ms Zoom 100% 6100 ms Full processing 6500 ms
2) after patch Open image 110 ms Zoom 100% 240 ms Full processing 5500 ms
On the other hand, there are (small) differences between the TIFs in the affected area - about 1 to 2 values in L (range 0..100). Impressive, good work
jacques
@Desmis Jacques, I will check where the differences come from.
@Desmis Jacques, about the differences. The old version tonemapped an area of size (w + del) × (h + del), where del == 3. The 3 pixel borders were filled with zero. The new version tonemaps an area of size w × h without zero-filled borders, which should be more correct for a global operator such as tonemapping.
@Desmis Jacques, this patch also includes the improvements for cbdl and softlight. locallab_softlight.patch.txt
No differences in tif
@heckflosse I come back from walking :) I tested the last patch - locallab_softlight.patch.txt
"Same" improvement for "Softlight" and "CBDL" as for "vibrance" and "TM" for speed-up, especially with zoom 100%.... No differences in TIF
For TM I think you are right, the border pixels may change the TM behavior a little
Very good job :)
Thank you jacques
@Desmis Jacques, the attached patch includes a cleanup. I deduplicated the vibrance code. newlocallab_dedup_vibrance_code.patch.txt
@heckflosse Ingo I just tested, works fine, no differences in TIF. Same good speed-up improvement
Very good code optimisation by simplification :)
It will be my last contribution.... until Tuesday
Thank you
jacques
@Desmis Enjoy Paris :+1:
@Desmis Jacques, I will try to improve all the locallab tools before you come back from Paris. Though I can not promise, as it's a lot of work...
@Desmis Jacques, This patch includes the improvements for locallab exposure and colour&light
Next one will be locallab Shadows/Highlights
For information: The speedups in this topic are only for normal locallab spots. For excluding spots I will open a new topic.
@Desmis Jacques, I pushed my improvements to newlocallab-speedup2.
The following tools in normal
mode got speedups:
Exposure/Sh&HL/Color&Light/softlight/tonemapping/vibrance/cbdl
@heckflosse I compile, no problem I run
*same behaviour with "mask" and Color and Light
jacques
@Desmis Jacques, I will have a look
@heckflosse Ingo, thank you Jacques
@Desmis Jacques, can you provide a pp3 which leads to crash?
@heckflosse It seems there is also a problem with "mask" (only Exposure) in branch "newlocallab"
If I put the same curve for L(L) in Color and light, Exposure, Shadows highlight, the mask in "Exposure" is always blurred. And I just got a crash also with "newlocallab", with mask in "Exposure"
I will try to furnish a pp3, but difficult...
jacques
@heckflosse here a pp3, with colorspace_flowers.pef
colorspace_flowers.pefjdc.pp3.txt
When you zoom to 100% crash
Look at the mask in Exposure with and without C(C). The behavior in Exposure is erratic. jacques
With "newlocallab", no crash, but erratic behavior for "Exposure". For example, still in "Colorspace_flowers.pef": a) set neutral b) create a new spot on the yellow leaf at right c) enable "exposure" d) increase "exposure compensation" a little, +0.05: the leaf becomes darker, while it should become lighter... curious
@Desmis Jacques, I found the reason for the crash. That's an old bug caused by wrong indexing [ir][ir]
here
Looking for the exposure bug now
@Desmis The two bugs you mentioned are fixed now
@heckflosse I just tested, fixed :)
jacques
@Desmis Jacques, I would like to merge newlocallab-speedup2 into newlocallab now to get more tests. Any objections?
I'm working on further speedups for newlocallab.
@Desmis Jacques, can you please post the processing time of this raw
using the attached pp3 in queue?
Ingo
Nikon - D850 - 14bit compressed (Lossless) (3_2).NEF.pp3.txt