Please try the following patch:
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..7fcd01aa 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -17,6 +17,7 @@
*
*/
#include "stress-ng.h"
+#include "core-arch.h"
#include "core-cpu-cache.h"
static const stress_help_t help[] = {
@@ -25,7 +26,63 @@ static const stress_help_t help[] = {
{ NULL, NULL, NULL }
};
-#if defined(HAVE_SHIM_MFENCE)
+#if defined(STRESS_ARCH_X86)
+
+#define HAVE_PETERSON_SET_GET
+
+static inline ALWAYS_INLINE void set_bool(volatile bool *ptr, const bool value)
+{
+ *ptr = value;
+}
+
+static inline ALWAYS_INLINE void set_int(volatile int *ptr, const int value)
+{
+ *ptr = value;
+}
+
+static inline ALWAYS_INLINE bool get_bool(volatile bool *ptr)
+{
+ return *ptr;
+}
+
+static inline ALWAYS_INLINE int get_int(volatile int *ptr)
+{
+ return *ptr;
+}
+#elif defined(HAVE_ATOMIC_LOAD) && \
+ defined(HAVE_ATOMIC_STORE)
+
+#define HAVE_PETERSON_SET_GET
+
+static inline ALWAYS_INLINE void set_bool(volatile bool *ptr, const bool value)
+{
+ __atomic_store(ptr, &value, __ATOMIC_RELAXED);
+}
+
+static inline ALWAYS_INLINE void set_int(volatile int *ptr, const int value)
+{
+ __atomic_store(ptr, &value, __ATOMIC_RELAXED);
+}
+
+static inline ALWAYS_INLINE bool get_bool(volatile bool *ptr)
+{
+ bool value;
+
+ __atomic_load(ptr, &value, __ATOMIC_RELAXED);
+ return value;
+}
+
+static inline ALWAYS_INLINE int get_int(volatile int *ptr)
+{
+ int value;
+
+ __atomic_load(ptr, &value, __ATOMIC_RELAXED);
+ return value;
+}
+#endif
+
+#if defined(HAVE_SHIM_MFENCE) && \
+ defined(HAVE_PETERSON_SET_GET)
typedef struct {
volatile int turn;
@@ -53,10 +110,12 @@ static void stress_peterson_p0(const stress_args_t *args)
double t;
t = stress_time_now();
- peterson->m.flag[0] = true;
- peterson->m.turn = 1;
+ set_bool(&peterson->m.flag[0], true);
+ shim_mb();
+ set_int(&peterson->m.turn, 1);
+ shim_mb();
shim_mfence();
- while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
+ while (get_bool(&peterson->m.flag[1]) && (get_int(&peterson->m.turn) == 1)) {
}
/* Critical section */
@@ -64,7 +123,9 @@ static void stress_peterson_p0(const stress_args_t *args)
peterson->m.check++;
check1 = peterson->m.check;
- peterson->m.flag[0] = false;
+ shim_mb();
+ set_bool(&peterson->m.flag[0], false);
+ shim_mb();
shim_mfence();
peterson->p0.duration += stress_time_now() - t;
peterson->p0.count += 1.0;
@@ -81,8 +142,10 @@ static void stress_peterson_p1(const stress_args_t *args)
double t;
t = stress_time_now();
- peterson->m.flag[1] = true;
- peterson->m.turn = 0;
+ set_bool(&peterson->m.flag[1], true);
+ shim_mb();
+ set_int(&peterson->m.turn, 0);
+ shim_mb();
shim_mfence();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
}
@@ -93,7 +156,9 @@ static void stress_peterson_p1(const stress_args_t *args)
check1 = peterson->m.check;
inc_counter(args);
- peterson->m.flag[1] = false;
+ shim_mb();
+ set_bool(&peterson->m.flag[1], false);
+ shim_mb();
shim_mfence();
peterson->p1.duration += stress_time_now() - t;
peterson->p1.count += 1.0;
Hi @ColinIanKing, I've applied your patch and tested it on our hardware; however, I can still observe the symptom after a while:
# ./stress-ng-relaxed --peterson 0 --verbose
stress-ng: debug: [695] invoked with './stress-ng-relaxed --peterson 0 --verbose' by user 0
stress-ng: debug: [695] stress-ng 0.15.06 g53251eb1af78
stress-ng: debug: [695] system: Linux 5.19.7 #1 SMP Fri Mar 31 07:17:19 UTC 2023 riscv64
stress-ng: debug: [695] RAM total: 7.8G, RAM free: 7.6G, swap free: 0.0
stress-ng: debug: [695] temporary file path: '.', filesystem type: ext2
stress-ng: debug: [695] 2 processors online, 2 processors configured
stress-ng: info: [695] defaulting to a 86400 second (1 day, 0.00 secs) run per stressor
stress-ng: info: [695] dispatching hogs: 2 peterson
stress-ng: debug: [695] cache allocate: using cache maximum level L1
stress-ng: debug: [695] cache allocate: shared cache buffer size: 32K
stress-ng: debug: [695] starting stressors
stress-ng: debug: [695] 2 stressors started
stress-ng: debug: [696] peterson: started [696] (instance 0)
stress-ng: debug: [697] peterson: started [697] (instance 1)
stress-ng: fail: [697] peterson p1: peterson mutex check failed 248229 vs 248230
stress-ng: fail: [697] peterson p1: peterson mutex check failed 257794 vs 257795
stress-ng: fail: [696] peterson p1: peterson mutex check failed 368103 vs 368104
stress-ng: fail: [697] peterson p1: peterson mutex check failed 319146 vs 319147
stress-ng: fail: [697] peterson p1: peterson mutex check failed 319125 vs 319126
stress-ng: fail: [696] peterson p1: peterson mutex check failed 377249 vs 377250
stress-ng: fail: [696] peterson p1: peterson mutex check failed 379212 vs 379213
stress-ng: fail: [697] peterson p1: peterson mutex check failed 464739 vs 464740
info: 5 failures reached, aborting stress process
stress-ng: debug: [697] peterson: exited [697] (instance 1)
abort...
stress-ng: debug: [696] peterson: exited [696] (instance 0)
stress-ng: debug: [695] process [696] terminated
stress-ng: error: [695] process [697] (peterson) terminated with an error, exit status=2 (stressor failed)
stress-ng: debug: [695] process [697] terminated
stress-ng: metrc: [695] stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s CPU used per RSS Max
stress-ng: metrc: [695] (secs) (secs) (secs) (real time) (usr+sys time) instance (%) (KB)
stress-ng: metrc: [695] peterson 48897092 2061.00 4246.71 4.12 23724.93 11502.95 103.13 3372
stress-ng: metrc: [695] miscellaneous metrics:
stress-ng: metrc: [695] peterson 55622.72 nanosecs per mutex (geometic mean of 2 instances)
stress-ng: debug: [695] metrics-check: all stressor metrics validated and sane
stress-ng: info: [695] unsuccessful run completed in 2179.80s (36 mins, 19.80 secs)
From the patch, I'd like to suggest changing __ATOMIC_RELAXED to __ATOMIC_SEQ_CST for a stricter barrier, since Peterson's algorithm assumes sequentially consistent memory ordering. I'm running this change overnight, and so far so good; at least this round runs longer than the case above. I'll see if it is still alive tomorrow. Thanks.
Using __ATOMIC_SEQ_CST makes more sense, let's see if that helps.
It looks slightly better, but there are still failures within 12 hours; this one probably happened after about 3 hours (before I shut it down):
# ./stress-ng --peterson 0 --verbose
stress-ng: debug: [694] invoked with './stress-ng --peterson 0 --verbose' by user 0
stress-ng: debug: [694] stress-ng 0.15.06 g53251eb1af78
stress-ng: debug: [694] system: Linux 5.19.7 #1 SMP Fri Mar 31 07:17:19 UTC 2023 riscv64
stress-ng: debug: [694] RAM total: 7.8G, RAM free: 7.6G, swap free: 0.0
stress-ng: debug: [694] temporary file path: '.', filesystem type: ext2
stress-ng: debug: [694] 2 processors online, 2 processors configured
stress-ng: info: [694] defaulting to a 86400 second (1 day, 0.00 secs) run per stressor
stress-ng: info: [694] dispatching hogs: 2 peterson
stress-ng: debug: [694] cache allocate: using cache maximum level L2
stress-ng: debug: [694] cache allocate: shared cache buffer size: 128K
stress-ng: debug: [694] starting stressors
stress-ng: debug: [694] 2 stressors started
stress-ng: debug: [695] peterson: started [695] (instance 0)
stress-ng: debug: [696] peterson: started [696] (instance 1)
stress-ng: fail: [695] peterson p1: peterson mutex check failed 5211758 vs 5211759
stress-ng: fail: [696] peterson p1: peterson mutex check failed 9818975 vs 9818976
stress-ng: fail: [696] peterson p1: peterson mutex check failed 10408027 vs 10408028
stress-ng: fail: [696] peterson p1: peterson mutex check failed 12647705 vs 12647706
...
stress-ng: debug: [696] peterson: exited [696] (instance 1)
stress-ng: debug: [695] peterson: exited [695] (instance 0)
stress-ng: debug: [694] process [695] terminated
stress-ng: debug: [694] process [696] terminated
stress-ng: metrc: [694] stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s CPU used per RSS Max
stress-ng: metrc: [694] (secs) (secs) (secs) (real time) (usr+sys time) instance (%) (KB)
stress-ng: metrc: [694] peterson 970883097 44392.45 86502.16 100.95 21870.46 11210.72 97.54 3336
stress-ng: metrc: [694] miscellaneous metrics:
stress-ng: metrc: [694] peterson 61484.69 nanosecs per mutex (geometic mean of 2 instances)
stress-ng: debug: [694] metrics-check: all stressor metrics validated and sane
stress-ng: info: [694] successful run completed in 44392.54s (12 hours, 19 mins, 52.54 secs)
I think we can make the check1 update stricter. p0:
/* Critical section */
check0 = peterson->m.check;
check1 = __atomic_add_fetch(&peterson->m.check, 1, __ATOMIC_SEQ_CST);
p1:
/* Critical section */
check0 = peterson->m.check;
check1 = __atomic_sub_fetch(&peterson->m.check, 1, __ATOMIC_SEQ_CST);
I can try this if you think it's still reasonable, or we could make the change arch-specific, i.e. riscv only. Thanks.
Yes, this seems reasonable to me. Let me know how it works out; I can add the #ifdefs for riscv.
[edit] See my comment about this later on; I made a stupid mistake by saying this looked reasonable. Apologies.
OK, I will add them to the testing and post the result later. :)
@ColinIanKing
After burning it in overnight, adding the __atomic_add_fetch and __atomic_sub_fetch operations in the critical section works perfectly on our hardware, so I think we can merge this change to master.
# ./stress-ng --peterson 0 --verbose
stress-ng: debug: [738] invoked with './stress-ng --peterson 0 --verbose' by user 0
stress-ng: debug: [738] stress-ng 0.15.06 g53251eb1af78
stress-ng: debug: [738] system: Linux 5.19.7 #1 SMP Fri Mar 31 07:17:19 UTC 2023 riscv64
stress-ng: debug: [738] RAM total: 7.8G, RAM free: 7.6G, swap free: 0.0
stress-ng: debug: [738] temporary file path: '.', filesystem type: ext2
stress-ng: debug: [738] 2 processors online, 2 processors configured
stress-ng: info: [738] defaulting to a 86400 second (1 day, 0.00 secs) run per stressor
stress-ng: info: [738] dispatching hogs: 2 peterson
stress-ng: debug: [738] cache allocate: using cache maximum level L2
stress-ng: debug: [738] cache allocate: shared cache buffer size: 128K
stress-ng: debug: [738] starting stressors
stress-ng: debug: [738] 2 stressors started
stress-ng: debug: [739] peterson: started [739] (instance 0)
stress-ng: debug: [740] peterson: started [740] (instance 1)
abort...
stress-ng: debug: [739] peterson: exited [739] (instance 0)
stress-ng: debug: [740] peterson: exited [740] (instance 1)
stress-ng: debug: [738] process [739] terminated
stress-ng: debug: [738] process [740] terminated
stress-ng: metrc: [738] stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s CPU used per RSS Max
stress-ng: metrc: [738] (secs) (secs) (secs) (real time) (usr+sys time) instance (%) (KB)
stress-ng: metrc: [738] peterson 987353049 46934.77 91047.28 13.87 21036.71 10842.75 97.01 3404
stress-ng: metrc: [738] miscellaneous metrics:
stress-ng: metrc: [738] peterson 64631.04 nanosecs per mutex (geometic mean of 2 instances)
stress-ng: debug: [738] metrics-check: all stressor metrics validated and sane
stress-ng: info: [738] successful run completed in 46934.90s (13 hours, 2 mins, 14.90 secs)
Hi, thanks for testing, just to be 100% clear, what was the final solution?
Here's the finalized patch:
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..b0783413 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -17,6 +17,7 @@
*
*/
#include "stress-ng.h"
+#include "core-arch.h"
#include "core-cpu-cache.h"
static const stress_help_t help[] = {
@@ -25,7 +26,63 @@ static const stress_help_t help[] = {
{ NULL, NULL, NULL }
};
-#if defined(HAVE_SHIM_MFENCE)
+#if defined(STRESS_ARCH_X86)
+
+#define HAVE_PETERSON_SET_GET
+
+static inline ALWAYS_INLINE void set_bool(volatile bool *ptr, const bool value)
+{
+ *ptr = value;
+}
+
+static inline ALWAYS_INLINE void set_int(volatile int *ptr, const int value)
+{
+ *ptr = value;
+}
+
+static inline ALWAYS_INLINE bool get_bool(volatile bool *ptr)
+{
+ return *ptr;
+}
+
+static inline ALWAYS_INLINE int get_int(volatile int *ptr)
+{
+ return *ptr;
+}
+#elif defined(HAVE_ATOMIC_LOAD) && \
+ defined(HAVE_ATOMIC_STORE)
+
+#define HAVE_PETERSON_SET_GET
+
+static inline ALWAYS_INLINE void set_bool(volatile bool *ptr, const bool value)
+{
+ __atomic_store(ptr, &value, __ATOMIC_SEQ_CST);
+}
+
+static inline ALWAYS_INLINE void set_int(volatile int *ptr, const int value)
+{
+ __atomic_store(ptr, &value, __ATOMIC_SEQ_CST);
+}
+
+static inline ALWAYS_INLINE bool get_bool(volatile bool *ptr)
+{
+ bool value;
+
+ __atomic_load(ptr, &value, __ATOMIC_SEQ_CST);
+ return value;
+}
+
+static inline ALWAYS_INLINE int get_int(volatile int *ptr)
+{
+ int value;
+
+ __atomic_load(ptr, &value, __ATOMIC_SEQ_CST);
+ return value;
+}
+#endif
+
+#if defined(HAVE_SHIM_MFENCE) && \
+ defined(HAVE_PETERSON_SET_GET)
typedef struct {
volatile int turn;
@@ -53,18 +110,21 @@ static void stress_peterson_p0(const stress_args_t *args)
double t;
t = stress_time_now();
- peterson->m.flag[0] = true;
- peterson->m.turn = 1;
+ set_bool(&peterson->m.flag[0], true);
+ shim_mb();
+ set_int(&peterson->m.turn, 1);
+ shim_mb();
shim_mfence();
- while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
+ while (get_bool(&peterson->m.flag[1]) && (get_int(&peterson->m.turn) == 1)) {
}
/* Critical section */
check0 = peterson->m.check;
- peterson->m.check++;
- check1 = peterson->m.check;
+ check1 = __atomic_add_fetch(&peterson->m.check, 1, __ATOMIC_SEQ_CST);
- peterson->m.flag[0] = false;
+ shim_mb();
+ set_bool(&peterson->m.flag[0], false);
+ shim_mb();
shim_mfence();
peterson->p0.duration += stress_time_now() - t;
peterson->p0.count += 1.0;
@@ -81,19 +141,22 @@ static void stress_peterson_p1(const stress_args_t *args)
double t;
t = stress_time_now();
- peterson->m.flag[1] = true;
- peterson->m.turn = 0;
+ set_bool(&peterson->m.flag[1], true);
+ shim_mb();
+ set_int(&peterson->m.turn, 0);
+ shim_mb();
shim_mfence();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
}
/* Critical section */
check0 = peterson->m.check;
- peterson->m.check--;
- check1 = peterson->m.check;
+ check1 = __atomic_sub_fetch(&peterson->m.check, 1, __ATOMIC_SEQ_CST);
inc_counter(args);
- peterson->m.flag[1] = false;
+ shim_mb();
+ set_bool(&peterson->m.flag[1], false);
+ shim_mb();
shim_mfence();
peterson->p1.duration += stress_time_now() - t;
peterson->p1.count += 1.0;
Thanks for the clarification. Would it be possible to re-test the above patch without the shim_mb() calls? (These are just compiler memory barriers to stop the compiler from re-ordering the code.) I suspect they can be removed without causing issues.
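For reference, a compiler-only barrier of this kind is typically just an empty asm statement with a memory clobber: it stops the compiler from moving loads and stores across that point but emits no CPU instruction. A minimal sketch (stress-ng's actual shim_mb() definition may differ slightly):

/* compiler barrier: no fence instruction is emitted; the "memory" clobber
 * only prevents the compiler from reordering memory accesses across it */
#define compiler_barrier()	__asm__ __volatile__("" ::: "memory")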
OK, I think shim_mfence() should be enough for this case. I'll try this change later tonight.
@ColinIanKing Looks good to me:)
# ./stress-ng --peterson 0 --verbose
stress-ng: debug: [722] invoked with './stress-ng --peterson 0 --verbose' by user 0
stress-ng: debug: [722] stress-ng 0.15.06 g53251eb1af78
stress-ng: debug: [722] system: Linux 5.19.7 #1 SMP Fri Mar 31 07:17:19 UTC 2023 riscv64
stress-ng: debug: [722] RAM total: 7.8G, RAM free: 7.6G, swap free: 0.0
stress-ng: debug: [722] temporary file path: '.', filesystem type: ext2
stress-ng: debug: [722] 2 processors online, 2 processors configured
stress-ng: info: [722] defaulting to a 86400 second (1 day, 0.00 secs) run per stressor
stress-ng: info: [722] dispatching hogs: 2 peterson
stress-ng: debug: [722] cache allocate: using cache maximum level L2
stress-ng: debug: [722] cache allocate: shared cache buffer size: 128K
stress-ng: debug: [722] starting stressors
stress-ng: debug: [722] 2 stressors started
stress-ng: debug: [723] peterson: started [723] (instance 0)
stress-ng: debug: [724] peterson: started [724] (instance 1)
...
stress-ng: debug: [723] peterson: exited [723] (instance 0)
stress-ng: debug: [722] process [723] terminated
stress-ng: debug: [724] peterson: exited [724] (instance 1)
stress-ng: debug: [722] process [724] terminated
stress-ng: metrc: [722] stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s CPU used per RSS Max
stress-ng: metrc: [722] (secs) (secs) (secs) (real time) (usr+sys time) instance (%) (KB)
stress-ng: metrc: [722] peterson 1030993769 47947.75 92988.52 17.40 21502.44 11085.25 96.99 3404
stress-ng: metrc: [722] miscellaneous metrics:
stress-ng: metrc: [722] peterson 63805.48 nanosecs per mutex (geometic mean of 2 instances)
stress-ng: debug: [722] metrics-check: all stressor metrics validated and sane
stress-ng: info: [722] successful run completed in 47947.80s (13 hours, 19 mins, 7.80 secs)
The final patch that was tested:
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..caaf3f76 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -17,6 +17,7 @@
*
*/
#include "stress-ng.h"
+#include "core-arch.h"
#include "core-cpu-cache.h"
static const stress_help_t help[] = {
@@ -25,7 +26,63 @@ static const stress_help_t help[] = {
{ NULL, NULL, NULL }
};
-#if defined(HAVE_SHIM_MFENCE)
+#if defined(STRESS_ARCH_X86)
+
+#define HAVE_PETERSON_SET_GET
+
+static inline ALWAYS_INLINE void set_bool(volatile bool *ptr, const bool value)
+{
+ *ptr = value;
+}
+
+static inline ALWAYS_INLINE void set_int(volatile int *ptr, const int value)
+{
+ *ptr = value;
+}
+
+static inline ALWAYS_INLINE bool get_bool(volatile bool *ptr)
+{
+ return *ptr;
+}
+
+static inline ALWAYS_INLINE int get_int(volatile int *ptr)
+{
+ return *ptr;
+}
+#elif defined(HAVE_ATOMIC_LOAD) && \
+ defined(HAVE_ATOMIC_STORE)
+
+#define HAVE_PETERSON_SET_GET
+
+static inline ALWAYS_INLINE void set_bool(volatile bool *ptr, const bool value)
+{
+ __atomic_store(ptr, &value, __ATOMIC_SEQ_CST);
+}
+
+static inline ALWAYS_INLINE void set_int(volatile int *ptr, const int value)
+{
+ __atomic_store(ptr, &value, __ATOMIC_SEQ_CST);
+}
+
+static inline ALWAYS_INLINE bool get_bool(volatile bool *ptr)
+{
+ bool value;
+
+ __atomic_load(ptr, &value, __ATOMIC_SEQ_CST);
+ return value;
+}
+
+static inline ALWAYS_INLINE int get_int(volatile int *ptr)
+{
+ int value;
+
+ __atomic_load(ptr, &value, __ATOMIC_SEQ_CST);
+ return value;
+}
+#endif
+
+#if defined(HAVE_SHIM_MFENCE) && \
+ defined(HAVE_PETERSON_SET_GET)
typedef struct {
volatile int turn;
@@ -53,18 +110,17 @@ static void stress_peterson_p0(const stress_args_t *args)
double t;
t = stress_time_now();
- peterson->m.flag[0] = true;
- peterson->m.turn = 1;
+ set_bool(&peterson->m.flag[0], true);
+ set_int(&peterson->m.turn, 1);
shim_mfence();
- while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
+ while (get_bool(&peterson->m.flag[1]) && (get_int(&peterson->m.turn) == 1)) {
}
/* Critical section */
check0 = peterson->m.check;
- peterson->m.check++;
- check1 = peterson->m.check;
+ check1 = __atomic_add_fetch(&peterson->m.check, 1, __ATOMIC_SEQ_CST);
- peterson->m.flag[0] = false;
+ set_bool(&peterson->m.flag[0], false);
shim_mfence();
peterson->p0.duration += stress_time_now() - t;
peterson->p0.count += 1.0;
@@ -81,19 +137,18 @@ static void stress_peterson_p1(const stress_args_t *args)
double t;
t = stress_time_now();
- peterson->m.flag[1] = true;
- peterson->m.turn = 0;
+ set_bool(&peterson->m.flag[1], true);
+ set_int(&peterson->m.turn, 0);
shim_mfence();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
}
/* Critical section */
check0 = peterson->m.check;
- peterson->m.check--;
- check1 = peterson->m.check;
+ check1 = __atomic_sub_fetch(&peterson->m.check, 1, __ATOMIC_SEQ_CST);
inc_counter(args);
- peterson->m.flag[1] = false;
+ set_bool(&peterson->m.flag[1], false);
shim_mfence();
peterson->p1.duration += stress_time_now() - t;
peterson->p1.count += 1.0;
Many thanks! Much appreciated.
The idea behind the code is to create a critical section where we can alter the peterson->m.check counter because it is essentially locked. An atomic operation such as:
check1 = __atomic_add_fetch(&peterson->m.check, 1, __ATOMIC_SEQ_CST);
should not be needed if the code is running correctly, since the counter is protected by the critical section. If the atomic operations around it are removed and the test fails, that implies the peterson mutex is broken. Apologies, I earlier said this looked reasonable, but after checking it again I remembered why we have this increment/decrement on the check counter - it was for locking sanity checks. Oops.
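To spell out how such a sanity check works, here is an illustrative sketch (not stress-ng's exact code or failure-report format): while the lock is held, no other process should be able to change the counter between the read and the re-read.

#include <stdio.h>

static volatile int check;	/* shared between the two processes in the real stressor */

static void critical_section_check(void)
{
	int check0, check1;

	check0 = check;		/* read the shared counter while holding the lock */
	check++;		/* plain, non-atomic increment: only safe if the mutex works */
	check1 = check;		/* re-read it */
	if (check1 != check0 + 1)	/* another process raced into the critical section */
		fprintf(stderr, "sanity check failed: %d -> %d\n", check0, check1);
}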
I think I need to get an SMP RISC-V board and look at this in more detail. The original code works fine on a bunch of non-x86 SMP systems (e.g. an ARM 24-core dev board I have), so I'm only assuming it may be down to read/write re-ordering.
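To make the suspected re-ordering concrete, here is a minimal sketch of Peterson's entry/exit protocol for thread 0 in plain C (not stress-ng's code; the fence placement is the point under discussion - on a weakly ordered CPU the loads in the spin loop must not be hoisted above the two stores):

#include <stdbool.h>

static volatile bool flag[2];
static volatile int turn;

static void peterson_lock_thread0(void)	/* thread 1 mirrors this with indices swapped */
{
	flag[0] = true;		/* announce intent to enter */
	turn = 1;		/* give priority to the other thread */
	__sync_synchronize();	/* full barrier: without it the CPU may reorder the stores
				 * above with the loads below (store->load reordering) */
	while (flag[1] && turn == 1)
		;		/* spin while the other thread wants in and has priority */
	/* critical section */
}

static void peterson_unlock_thread0(void)
{
	__sync_synchronize();	/* keep critical-section accesses before the release */
	flag[0] = false;	/* release */
}

Even with the fence in place, whether plain volatile accesses are sufficient on a given weakly ordered implementation is exactly what this thread is probing.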
@ColinIanKing How long a period did you test on the ARM board? It might not be easy to reproduce the symptom, and it is certainly still controversial - there are many opinions about whether the original Peterson algorithm works on SMP.
I'm going to do a few 24+ hour runs to see.
Oh, one further question, which risc-v system are you testing this on?
We're running stress-ng on an FPGA system that is still under development, because we don't have the development board yet. I can probably find some resources internally to confirm whether the real board has this issue, but I think we should keep this open until we have a clear summary or you have a RISC-V board to clarify the problem. Thanks for understanding.
It will be a few weeks before I can get a dev board for my own testing; I'll ask around to see if this can be exercised on other RISC-V kit.
Just to let you know, I can't reproduce this on a 24-core ARM® Cortex-A53 after a 24-hour soak test.
Thanks for update! That would be useful for reference.
I asked a friend to run 12-hour and 24-hour soak tests on an SMP SiFive Unmatched SoC (quad-core 64-bit SiFive U74) - see https://sifive.cdn.prismic.io/sifive/d0556df9-55c6-47a8-b0f2-4b1521546543_hifive-unmatched-datasheet.pdf - and it worked perfectly without any problems. I wonder if this issue is specific to the FPGA processor you are using.
Meanwhile, I'm waiting for delivery of a quad-core SoC dev board that should arrive by the end of May 2023. I'm going to make a new stress-ng release before the end of the month, so I will re-examine this issue for the June release of stress-ng.
Hi @ColinIanKing, I've also tested the Unmatched board and did not see this issue; however, it is an in-order architecture, while our architecture under development is out-of-order, so that is probably the main difference. I'm fine with waiting for your re-examination, no hurry, and you can close this once your result is the same. Thanks for the efforts.
I guess with out-of-order execution we need to insert some more fencing instructions in the code. I wonder if adding shim_mfence calls around the peterson variable updates will force enough ordering to fix this, e.g.
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..3a7fe994 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -53,7 +53,9 @@ static void stress_peterson_p0(const stress_args_t *args)
double t;
t = stress_time_now();
+ shim_mfence();
peterson->m.flag[0] = true;
+ shim_mfence();
peterson->m.turn = 1;
shim_mfence();
while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
@@ -64,6 +66,7 @@ static void stress_peterson_p0(const stress_args_t *args)
peterson->m.check++;
check1 = peterson->m.check;
+ shim_mfence();
peterson->m.flag[0] = false;
shim_mfence();
peterson->p0.duration += stress_time_now() - t;
@@ -81,7 +84,9 @@ static void stress_peterson_p1(const stress_args_t *args)
double t;
t = stress_time_now();
+ shim_mfence();
peterson->m.flag[1] = true;
+ shim_mfence();
peterson->m.turn = 0;
shim_mfence();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
@@ -93,6 +98,7 @@ static void stress_peterson_p1(const stress_args_t *args)
check1 = peterson->m.check;
inc_counter(args);
+ shim_mfence();
peterson->m.flag[1] = false;
shim_mfence();
peterson->p1.duration += stress_time_now() - t;
I can give it a try later when I'm available to do the testing.
Unfortunately, adding the fences from your change still fails the check after a few hours, so it looks insufficient for our case. Btw, what kind of RISC-V board did you purchase? I think there is currently no out-of-order RISC-V SoC shipping on the market yet.
That's useful to know. My other thought was that perhaps the compiler is doing some optimization and re-ordering of the code too; one way of stopping this with gcc is to use a compiler memory barrier to tell it to keep store ordering. With this plus the fence ops, perhaps we can get this to work properly.
Perhaps you could try this:
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..9adbcef1 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -33,6 +33,12 @@ typedef struct {
volatile bool flag[2];
} peterson_mutex_t;
+#define BARRIER_FENCE() \
+do { \
+ __asm__ __volatile__("" ::: "memory"); \
+ shim_mfence(); \
+} while (0);
+
/*
* peterson_t is mmap'd shared and m, p0, p1 are 64 byte cache aligned
* to reduce cache contention when updating metrics on p0 and p1
@@ -53,19 +59,23 @@ static void stress_peterson_p0(const stress_args_t *args)
double t;
t = stress_time_now();
+ BARRIER_FENCE();
peterson->m.flag[0] = true;
+ BARRIER_FENCE();
peterson->m.turn = 1;
- shim_mfence();
+ BARRIER_FENCE();
while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
}
+ BARRIER_FENCE();
/* Critical section */
check0 = peterson->m.check;
peterson->m.check++;
check1 = peterson->m.check;
+ BARRIER_FENCE();
peterson->m.flag[0] = false;
- shim_mfence();
+ BARRIER_FENCE();
peterson->p0.duration += stress_time_now() - t;
peterson->p0.count += 1.0;
@@ -81,11 +91,14 @@ static void stress_peterson_p1(const stress_args_t *args)
double t;
t = stress_time_now();
+ BARRIER_FENCE();
peterson->m.flag[1] = true;
+ BARRIER_FENCE();
peterson->m.turn = 0;
- shim_mfence();
+ BARRIER_FENCE();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
}
+ BARRIER_FENCE();
/* Critical section */
check0 = peterson->m.check;
@@ -93,8 +106,9 @@ static void stress_peterson_p1(const stress_args_t *args)
check1 = peterson->m.check;
inc_counter(args);
+ BARRIER_FENCE();
peterson->m.flag[1] = false;
- shim_mfence();
+ BARRIER_FENCE();
peterson->p1.duration += stress_time_now() - t;
peterson->p1.count += 1.0;
Hi @ColinIanKing, I referred to [1] to make a change and re-ran the test, and it works perfectly on our platform over 24 hours, so here's a patch with the slight change:
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..31660740 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -55,8 +55,9 @@ static void stress_peterson_p0(const stress_args_t *args)
t = stress_time_now();
peterson->m.flag[0] = true;
peterson->m.turn = 1;
- shim_mfence();
+ __sync_synchronize();
while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
+ sched_yield();
}
/* Critical section */
@@ -83,8 +84,9 @@ static void stress_peterson_p1(const stress_args_t *args)
t = stress_time_now();
peterson->m.flag[1] = true;
peterson->m.turn = 0;
- shim_mfence();
+ __sync_synchronize();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
+ sched_yield();
}
/* Critical section */
From my rough perspective, there should not be a major difference between __asm__ __volatile__("fence" ::: "memory"); and __sync_synchronize(), as both implement a full memory barrier that is also supported by riscv, so I'm not sure why they produce different results on our platform. But using the GCC builtin function seems more sensible for portability and standardization. I think it's worth considering this change, since it won't affect other arches and will probably benefit this stressor's efficiency thanks to sched_yield. Thanks.
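As a quick way to compare the two forms, both of the following should compile down to a single full fence on riscv64 with gcc. A small standalone sketch (not part of stress-ng) that can be inspected with riscv64 gcc -O2 -S:

/* both functions should contain one full "fence" (fence iorw,iorw)
 * in the generated riscv64 assembly */
void barrier_inline_asm(void)
{
	__asm__ __volatile__("fence" ::: "memory");
}

void barrier_builtin(void)
{
	__sync_synchronize();
}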
Log:
stress-ng: debug: [705] invoked with './stress-ng-peterson --peterson 0 --verbose' by user 0
stress-ng: debug: [705] stress-ng 0.15.06 g53251eb1af78
stress-ng: debug: [705] system: Linux 5.19.7 #1 SMP Fri Mar 31 07:17:19 UTC 2023 riscv64
stress-ng: debug: [705] RAM total: 7.8G, RAM free: 7.6G, swap free: 0.0
stress-ng: debug: [705] temporary file path: '.', filesystem type: ext2
stress-ng: debug: [705] 2 processors online, 2 processors configured
stress-ng: info: [705] defaulting to a 86400 second (1 day, 0.00 secs) run per stressor
stress-ng: info: [705] dispatching hogs: 2 peterson
stress-ng: debug: [705] cache allocate: using cache maximum level L2
stress-ng: debug: [705] cache allocate: shared cache buffer size: 256K
stress-ng: debug: [705] starting stressors
stress-ng: debug: [705] 2 stressors started
stress-ng: debug: [706] peterson: started [706] (instance 0)
stress-ng: debug: [707] peterson: started [707] (instance 1)
stress-ng: debug: [706] peterson: exited [706] (instance 0)
stress-ng: debug: [705] process [706] terminated
stress-ng: debug: [707] peterson: exited [707] (instance 1)
stress-ng: debug: [705] process [707] terminated
stress-ng: metrc: [705] stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s CPU used per RSS Mx
stress-ng: metrc: [705] (secs) (secs) (secs) (real time) (usr+sys time) instance (%) (K)
stress-ng: metrc: [705] peterson 666877480 86400.03 150848.14 10712.76 7718.49 4127.72 93.50 340
stress-ng: metrc: [705] miscellaneous metrics:
stress-ng: metrc: [705] peterson 150603.99 nanosecs per mutex (geometic mean of 2 instances)
stress-ng: debug: [705] metrics-check: all stressor metrics validated and sane
stress-ng: info: [705] successful run completed in 86400.24s (1 day, 0.24 secs)
I find that shim_mfence actually produces two fence instructions in the assembly, so I'm going to run another change with the following:
diff --git a/core-cpu-cache.h b/core-cpu-cache.h
index 5f8bb5d0..ddbc66de 100644
--- a/core-cpu-cache.h
+++ b/core-cpu-cache.h
@@ -148,6 +148,7 @@ static inline void ALWAYS_INLINE shim_mfence(void)
defined(HAVE_ASM_RISCV_FENCE) && \
!defined(HAVE_SHIM_MFENCE)
stress_asm_riscv_fence();
+#define HAVE_SHIM_MFENCE
#endif
#if defined(STRESS_ARCH_X86) && \
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..38c9969d 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -57,6 +57,7 @@ static void stress_peterson_p0(const stress_args_t *args)
peterson->m.turn = 1;
shim_mfence();
while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
+ sched_yield();
}
/* Critical section */
@@ -85,6 +86,7 @@ static void stress_peterson_p1(const stress_args_t *args)
peterson->m.turn = 0;
shim_mfence();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
+ sched_yield();
}
/* Critical section */
It could be the last one, hopefully.
OK, the above patch ran successfully without any error for over 18 hours, so it's looking good to me so far. Can you please review and merge it if possible?
diff --git a/core-cpu-cache.h b/core-cpu-cache.h
index 5f8bb5d0..ddbc66de 100644
--- a/core-cpu-cache.h
+++ b/core-cpu-cache.h
@@ -148,6 +148,7 @@ static inline void ALWAYS_INLINE shim_mfence(void)
defined(HAVE_ASM_RISCV_FENCE) && \
!defined(HAVE_SHIM_MFENCE)
stress_asm_riscv_fence();
+#define HAVE_SHIM_MFENCE
#endif
#if defined(STRESS_ARCH_X86) && \
diff --git a/stress-peterson.c b/stress-peterson.c
index 0ef8b430..38c9969d 100644
--- a/stress-peterson.c
+++ b/stress-peterson.c
@@ -57,6 +57,7 @@ static void stress_peterson_p0(const stress_args_t *args)
peterson->m.turn = 1;
shim_mfence();
while (peterson->m.flag[1] && (peterson->m.turn == 1)) {
+ sched_yield();
}
/* Critical section */
@@ -85,6 +86,7 @@ static void stress_peterson_p1(const stress_args_t *args)
peterson->m.turn = 0;
shim_mfence();
while (peterson->m.flag[0] && (peterson->m.turn == 0)) {
+ sched_yield();
}
/* Critical section */
Then we can close this, thanks.
The core-cpu-cache.h fix landed last week:
commit 0acc6561cc7669c63ea5c26ff12f269d3459754e
Author: Colin Ian King <colin.i.king@gmail.com>
Date:   Mon May 22 11:35:36 2023 +0100
core-cpu-cache.h: define HAVE_SHIM_MFENCE for risc-v case
Currently HAVE_SHIM_MFENCE is not defined, causing __sync_synchronize()
to also be called hence ending up with two fence instructions. Fix this
by defining it.
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
diff --git a/core-cpu-cache.h b/core-cpu-cache.h
index 5f8bb5d0..ddbc66de 100644
--- a/core-cpu-cache.h
+++ b/core-cpu-cache.h
@@ -148,6 +148,7 @@ static inline void ALWAYS_INLINE shim_mfence(void)
defined(HAVE_ASM_RISCV_FENCE) && \
!defined(HAVE_SHIM_MFENCE)
stress_asm_riscv_fence();
+#define HAVE_SHIM_MFENCE
.. so from what I can see, the addition of the sched_yield() helps. I guess that's adding some extra latency into the spin loop and somehow avoiding the race. I'll add it to the stressor for RISC-V only, but I don't think it's necessarily correct.
yeap, it's enough. Thank you!
On riscv SMP hardware, we found that the peterson stressor fails its check from time to time, as below:
Stress-ng version: 0.15.06
Hardware: riscv, 2 cores
How to reproduce it:
stress-ng --peterson 2 --verbose
After some research, I found discussions suggesting that the Peterson algorithm may not work on multi-core systems [1] due to memory ordering and architecture specifics. For example, [2] shares this point:
Besides, [3] and [4] mention that atomic operations should be used for locking/unlocking, and maybe it's worth trying. I'm not sure what other arches are like, so I'm raising the concern here and looking for help, to see if we can get more data or start improving this case.
Thanks,
[1] https://www.scaler.com/topics/petersons-solution/
[2] https://w3.cs.jmu.edu/kirkpams/OpenCSF/Books/csf/html/CritSect.html
[3] https://www.cs.uic.edu/~jbell/CourseNotes/OperatingSystems/5_Synchronization.html
[4] https://stackoverflow.com/questions/11588514/a-tested-implementation-of-peterson-lock-algorithm
[5] https://www.eecs.yorku.ca/course_archive/2020-21/S/3221/Lect6.pdf
[6] https://web.cecs.pdx.edu/~harry/riscv/RISCV-Summary.pdf