Open nicocas7ro opened 2 years ago
I get the same result with multi GPU.
I get the same result,showing the same problem
I get the same result with multi GPU.
excesuse me, is your problem solved?
I am also experiencing the same issue.
Has anyone else had this problem and figured out what to do to fix it?
After I cleaned the cache file in the dataset file and train works.
我也遇到了同样的问题。
有没有其他人遇到过这个问题,并想出了如何解决它?
直接使用python train.py 就可以了,但是后台查看,跑的时候,还是多个GPU一起的
Very strange, getting same with mine. Running on multiple GPUs with much reduced input image resolution and btach size 8.
python -m torch.distributed.launch --nproc_per_node 4 --master_port 9527 train.py --workers 2 --device 0,1,2,3 --sync-bn --batch-size 8 --data data/coco.yaml --img 320 320 --cfg cfg/training/yolov7.yaml --weights '' --name AM2yolov7 --hyp data/hyp.scratch.p5.yaml
`autoanchor: Evolving anchors with Genetic Algorithm: fitness = 0.6890: 100%|█████████▉| 995/1000 [04:29<00:01, 3.autoanchor: Evolving anchors with Genetic Algorithm: fitness = 0.6890: 100%|█████████▉| 996/1000 [04:30<00:01, 3.autoanchor: Evolving anchors with Genetic Algorithm: fitness = 0.6890: 100%|█████████▉| 997/1000 [04:30<00:00, 3.autoanchor: Evolving anchors with Genetic Algorithm: fitness = 0.6890: 100%|█████████▉| 998/1000 [04:30<00:00, 3.autoanchor: Evolving anchors with Genetic Algorithm: fitness = 0.6890: 100%|█████████▉| 999/1000 [04:30<00:00, 3.autoanchor: Evolving anchors with Genetic Algorithm: fitness = 0.6890: 100%|██████████| 1000/1000 [04:31<00:00, 3autoanchor: Evolving anchors with Genetic Algorithm: fitness = 0.6890: 100%|██████████| 1000/1000 [04:31<00:00, 3.69it/s]
Traceback (most recent call last):
File "/yolov7/train.py", line 616, in
typedef long long int int64_t; typedef unsigned int uint32_t; typedef signed char int8_t; typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char" typedef short int16_t; static_assert(sizeof(int64_t) == 8, "expected size does not match"); static_assert(sizeof(uint32_t) == 4, "expected size does not match"); static_assert(sizeof(int8_t) == 1, "expected size does not match"); constexpr int num_threads = 128; constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live constexpr int block_work_size = thread_work_size * num_threads; //TODO use _assert_fail, because assert is disabled in non-debug builds
namespace std {
using ::signbit; using ::isfinite; using ::isinf; using ::isnan;
using ::abs;
using ::acos; using ::acosf; using ::asin; using ::asinf; using ::atan; using ::atanf; using ::atan2; using ::atan2f; using ::ceil; using ::ceilf; using ::cos; using ::cosf; using ::cosh; using ::coshf;
using ::exp; using ::expf;
using ::fabs; using ::fabsf; using ::floor; using ::floorf;
using ::fmod; using ::fmodf;
using ::frexp; using ::frexpf; using ::ldexp; using ::ldexpf;
using ::log; using ::logf;
using ::log10; using ::log10f; using ::modf; using ::modff;
using ::pow; using ::powf;
using ::sin; using ::sinf; using ::sinh; using ::sinhf;
using ::sqrt; using ::sqrtf; using ::tan; using ::tanf;
using ::tanh; using ::tanhf;
using ::acosh; using ::acoshf; using ::asinh; using ::asinhf; using ::atanh; using ::atanhf; using ::cbrt; using ::cbrtf;
using ::copysign; using ::copysignf;
using ::erf; using ::erff; using ::erfc; using ::erfcf; using ::exp2; using ::exp2f; using ::expm1; using ::expm1f; using ::fdim; using ::fdimf; using ::fmaf; using ::fma; using ::fmax; using ::fmaxf; using ::fmin; using ::fminf; using ::hypot; using ::hypotf; using ::ilogb; using ::ilogbf; using ::lgamma; using ::lgammaf; using ::llrint; using ::llrintf; using ::llround; using ::llroundf; using ::log1p; using ::log1pf; using ::log2; using ::log2f; using ::logb; using ::logbf; using ::lrint; using ::lrintf; using ::lround; using ::lroundf;
using ::nan; using ::nanf;
using ::nearbyint; using ::nearbyintf; using ::nextafter; using ::nextafterf; using ::remainder; using ::remainderf; using ::remquo; using ::remquof; using ::rint; using ::rintf; using ::round; using ::roundf; using ::scalbln; using ::scalblnf; using ::scalbn; using ::scalbnf; using ::tgamma; using ::tgammaf; using ::trunc; using ::truncf;
} // namespace std
// NB: Order matters for this macro; it is relied upon in // _promoteTypesLookup and the serialization format. // Note, some types have ctype as void because we don't support them in codegen
_(uint8t, Byte) / 0 / \
(int8t, Char) / 1 / \
(int16t, Short) / 2 / \
(int, Int) / 3 / \
_(int64t, Long) / 4 / \
(at::Half, Half) / 5 / \
(float, Float) / 6 / \
(double, Double) / 7 / \
(std::complex
_(uint8t, Byte) \
(int8t, Char) \
(int16t, Short) \
(int, Int) \
_(int64t, Long) \
(at::Half, Half) \
(float, Float) \
(double, Double) \
(std::complex
enum class ScalarType : int8_t {
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
Undefined,
NumOptions };
template <typename T, int size> struct Array { T data[size];
device T operator[](int i) const { return data[i]; } device T& operator[](int i) { return data[i]; } Array() = default; Array(const Array&) = default; Array& operator=(const Array&) = default; device Array(T x) { for (int i = 0; i < size; i++) { data[i] = x; } } };
template
device DivMod(T _div, T _mod) { div = _div; mod = _mod; } };
//
device inline unsigned int div(unsigned int n) const { unsigned int t = __umulhi(n, m1); return (t + n) >> shift; }
device inline unsigned int mod(unsigned int n) const { return n - div(n) * divisor; }
device inline DivMod
unsigned int divisor; // d above. unsigned int m1; // Magic number: m' above. unsigned int shift; // Shift amounts. };
template
for (int arg = 0; arg < NARGS; arg++) {
offsets[arg] = linear_idx;
}
return offsets;
}
};
template
for (int arg = 0; arg < NARGS; ++arg) {
offsets[arg] = 0;
}
#pragma unroll
for (int dim = 0; dim < 25; ++dim) {
if (dim == dims) {
break;
}
auto divmod = sizes_[dim].divmod(linear_idx);
linear_idx = divmod.div;
#pragma unroll
for (int arg = 0; arg < NARGS; ++arg) {
offsets[arg] += divmod.mod * strides_[dim][arg];
}
//printf("offset calc thread dim size stride offset %d %d %d %d %d %d %d %d\n",
//threadIdx.x, dim, sizes_[dim].divisor, strides_[dim][0], offsets[0], linear_idx, divmod.div, divmod.mod);
}
return offsets;
}
int dims;
IntDivider sizes_[25];
// NOTE: this approach will not support nInputs == 0
unsigned int strides_[25][NARGS];
};
template
template
// aligned vector generates vectorized load/store on CUDA template<typename scalar_t, int vec_size> struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { scalar_t val[vec_size]; };
C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { // get GCD of num and denom using Euclid's algorithm. // Can replace this with std::gcd if we ever support c++17. size_t a = denominator; size_t b = numerator; while (b != 0) { a %= b; // swap(a,b) size_t tmp = a; a = b; b = tmp; }
// a is now the GCD
numerator /= a;
denominator /= a;
}
struct ReduceConfig { //has to match host-side ReduceConfig in the eager code static constexpr int BLOCK_X = 0; static constexpr int BLOCK_Y = 1; static constexpr int CTA = 2;
static constexpr int input_vec_size = 4; int element_size_bytes; int num_inputs; int num_outputs; int step_input = 1; int step_output = 1; int ctas_per_output = 1; int input_mult[3] = {0, 0, 0}; int output_mult[2] = {0, 0};
int block_width; int block_height; int num_threads;
bool vectorize_input = false; int output_vec_size = 1;
C10_HOST_DEVICE bool should_block_x_reduce() const { return input_mult[BLOCK_X] != 0; }
C10_HOST_DEVICE bool should_block_y_reduce() const { return input_mult[BLOCK_Y] != 0; }
C10_HOST_DEVICE bool should_global_reduce() const { return input_mult[CTA] != 0; }
C10_DEVICE bool should_store(int output_idx) const { return output_idx < num_outputs && (!should_block_x_reduce() || threadIdx.x == 0) && (!should_block_y_reduce() || threadIdx.y == 0); }
C10_DEVICE bool should_reduce_tail() const { return (!should_block_y_reduce() || threadIdx.y == 0) && (!should_global_reduce() || blockIdx.y == 0); }
C10_HOST_DEVICE int input_idx() const { int lane = threadIdx.x; int warp = threadIdx.y; int cta2 = blockIdx.y; return (lane input_mult[BLOCK_X] + warp input_mult[BLOCK_Y] + cta2 * input_mult[CTA]); }
template
C10_DEVICE int shared_memory_offset(int offset) const { return threadIdx.x + (threadIdx.y + offset) * blockDim.x; }
C10_DEVICE int staging_memory_offset(int cta2) const { int offset = cta2 + blockIdx.x gridDim.y; if (!should_block_x_reduce()) { offset = threadIdx.x + offset blockDim.x; } return offset; }
};
//TODO this will need to be different for more generic reduction functions namespace reducer {
using scalar_t = float; using arg_t = float; using out_scalar_t = float;
inline device arg_t combine(arg_t a, arg_t b) { return a * b; }
inline device out_scalar_t project(arg_t arg) { return (out_scalar_t) arg; }
inline device arg_t warp_shfl_down(arg_t arg, int offset) { return WARP_SHFL_DOWN(arg, offset); }
inline device arg_t translate_idx(arg_t acc, int64_t /idx/) { return acc; }
// wrap a normal reduction that ignores the index inline device arg_t reduce(arg_t acc, arg_t val, int64_t idx) { return combine(acc, val); } }
struct ReduceJitOp { using scalar_t = float; using arg_t = float; using out_scalar_t = float;
using InputCalculator = OffsetCalculator<1>; using OutputCalculator = OffsetCalculator<2>;
// static constexpr bool can_accumulate_in_output = // std::is_convertible<arg_t, out_scalar_t>::value // && std::is_convertible<out_scalar_t, arg_t>::value;
static constexpr int input_vec_size = ReduceConfig::input_vec_size;
arg_t ident; ReduceConfig config; InputCalculator input_calc; OutputCalculator output_calc; const void src; const char dst[2]; //it accepts at most two destinations // acc_buf used for accumulation among sub Tensor Iterator when accumulation on // output is not permissible void acc_buf; // cta_buf used for accumulation between blocks during global reduction void cta_buf; int* semaphores; int64_t base_idx; bool accumulate; bool final_output; int noutputs;
C10_DEVICE void run() const { extern shared char shared_memory[]; uint32_t output_idx = config.output_idx<1>(); uint32_t input_idx = config.input_idx(); auto base_offsets1 = output_calc.get(output_idx)[1];
using arg_vec_t = Array<arg_t, 1>;
arg_vec_t value;
if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);
value = thread_reduce<1>(input_slice);
}
if (config.should_block_y_reduce()) {
value = block_y_reduce<1>(value, shared_memory);
}
if (config.should_block_x_reduce()) {
value = block_x_reduce<1>(value, shared_memory);
}
using out_ptr_vec_t = Array<out_scalar_t*, 1>;
using offset_vec_t = Array<uint32_t, 1>;
offset_vec_t base_offsets;
out_ptr_vec_t out;
#pragma unroll
for (int i = 0; i < 1; i++) {
base_offsets[i] = output_calc.get(output_idx + i)[0];
out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
}
arg_vec_t* acc = nullptr;
if (acc_buf != nullptr) {
size_t numerator = sizeof(arg_t);
size_t denominator = sizeof(out_scalar_t);
reduce_fraction(numerator, denominator);
acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
}
if (config.should_global_reduce()) {
value = global_reduce<1>(value, acc, shared_memory);
} else if (config.should_store(output_idx)) {
if (accumulate) {
#pragma unroll
for (int i = 0; i < 1; i++) {
value[i] = reducer::translate_idx(value[i], base_idx);
}
}
if (acc == nullptr) {
if (accumulate) {
value = accumulate_in_output<1>(out, value);
}
if (final_output) {
set_results_to_output<1>(value, base_offsets);
} else {
#pragma unroll
for (int i = 0; i < 1; i++) {
*(out[i]) = get_accumulated_output(out[i], value[i]);
}
}
} else {
if (accumulate) {
#pragma unroll
for (int i = 0; i < 1; i++) {
value[i] = reducer::combine((*acc)[i], value[i]);
}
}
if (final_output) {
set_results_to_output<1>(value, base_offsets);
} else {
*acc = value;
}
}
}
}
template
C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { uint32_t end = config.num_inputs;
// Handle the head of input slice where data is not aligned
arg_t value = ident;
constexpr int align_bytes = alignof(aligned_vector<scalar_t, input_vec_size>);
constexpr int align_elements = align_bytes / sizeof(scalar_t);
int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t);
if (shift > 0) {
data -= shift;
end += shift;
if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift);
}
end -= align_elements;
data += align_elements;
shift = align_elements - shift;
}
// Do the vectorized reduction
using load_t = aligned_vector<scalar_t, input_vec_size>;
uint32_t idx = config.input_idx();
const uint32_t stride = config.step_input;
// Multiple accumulators to remove dependency between unrolled loops.
arg_t value_list[input_vec_size];
value_list[0] = value;
#pragma unroll
for (int i = 1; i < input_vec_size; i++) {
value_list[i] = ident;
}
scalar_t values[input_vec_size];
load_t *values_vector = reinterpret_cast<load_t*>(&values[0]);
while (idx * input_vec_size + input_vec_size - 1 < end) {
*values_vector = reinterpret_cast<const load_t*>(data)[idx];
#pragma unroll
for (uint32_t i = 0; i < input_vec_size; i++) {
value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i);
}
idx += stride;
}
// tail
uint32_t tail_start = end - end % input_vec_size;
if (config.should_reduce_tail()) {
int idx = tail_start + threadIdx.x;
if (idx < end) {
value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift);
}
}
// combine accumulators
#pragma unroll
for (int i = 1; i < input_vec_size; i++) {
value_list[0] = reducer::combine(value_list[0], value_list[i]);
}
return value_list[0];
}
template <int output_vec_size, typename offset_calc_t> C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalart* data, offset_calc_t calc) const { uint32_t idx = config.input_idx(); const uint32_t end = config.num_inputs; const uint32_t stride = config.step_input; const int vt0=4;
using arg_vec_t = Array<arg_t, output_vec_size>;
using load_t = aligned_vector<scalar_t, output_vec_size>;
const load_t* data = reinterpret_cast<const load_t*>(data_);
// Multiple accumulators to remove dependency between unrolled loops.
arg_vec_t value_list[vt0];
#pragma unroll
for (int i = 0; i < vt0; i++) {
#pragma unroll
for (int j = 0; j < output_vec_size; j++) {
value_list[i][j] = ident;
}
}
load_t values[vt0];
while (idx + (vt0 - 1) * stride < end) {
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
values[i] = data[calc(idx + i * stride) / output_vec_size];
}
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
#pragma unroll
for (uint32_t j = 0; j < output_vec_size; j++) {
value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride);
}
}
idx += stride * vt0;
}
// tail
int idx_ = idx;
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
if (idx >= end) {
break;
}
values[i] = data[calc(idx) / output_vec_size];
idx += stride;
}
idx = idx_;
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
if (idx >= end) {
break;
}
#pragma unroll
for (uint32_t j = 0; j < output_vec_size; j++) {
value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx);
}
idx += stride;
}
// combine accumulators
#pragma unroll
for (int i = 1; i < vt0; i++) {
#pragma unroll
for (uint32_t j = 0; j < output_vec_size; j++) {
value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]);
}
}
return value_list[0];
}
template
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], other[i]);
}
shared[address_base] = value;
}
}
dim_x = warpSize;
}
__syncthreads();
for (int offset = 1; offset < dim_x; offset <<= 1) {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
arg_t other = reducer::warp_shfl_down(value[i], offset);
value[i] = reducer::combine(value[i], other);
}
}
return value;
}
template
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], other[i]);
}
shared[config.shared_memory_offset(0)] = value;
}
}
return value;
}
C10_DEVICE bool mark_block_finished() const { shared bool is_last_block_done_shared;
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0) {
int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
}
__syncthreads();
return is_last_block_done_shared;
}
template
for (int i = 0; i < output_vec_size; i++) {
ret[i] = reducer::combine(*(out[i]), value[i]);
}
return ret;
}
C10_DEVICE out_scalar_t get_accumulated_output( out_scalar_t* out, arg_t value ) const { assert(!final_output); return (out_scalar_t)value; }
template
//TODO - multi-output reduction - we won't be able to use thrust::pair //just explicitly specify typed output reads/writes //Currently implemented for max of two outputs // template<class T1, class T2> // C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const { // if (noutputs >= 1) { // auto res0 = (T1)((char)dst[0] + base_offset); // res0 = x.first; // } // if (noutputs >= 2) { // // base offset is computed assuming element size being sizeof(T1), so we need to make a // // correction to obtain the correct base offset // auto res1 = (T2) ((char ) dst[1] + base_offset / sizeof(T1) sizeof(T2)); // *res1 = x.second; // } // }
template
for (int i = 0; i < output_vec_size; i++) {
set_results(reducer::project(value[i]), base_offset[i]);
}
}
template
arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
uint32_t output_idx = config.output_idx<output_vec_size>();
offset_vec_t base_offsets;
out_ptr_vec_t out;
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
base_offsets[i] = output_calc.get(output_idx + i)[0];
out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
}
bool should_store = config.should_store(output_idx);
if (should_store) {
uint32_t offset = config.staging_memory_offset(blockIdx.y);
reduce_buffer[offset] = value;
}
__threadfence(); // make sure writes are globally visible
__syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
bool is_last_block_done = mark_block_finished();
if (is_last_block_done) {
value = ident;
if (config.should_block_x_reduce()) {
uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
uint32_t step = blockDim.x * blockDim.y;
for (; input_offset < config.ctas_per_output; input_offset += step) {
uint32_t idx = config.staging_memory_offset(input_offset);
arg_vec_t next = reduce_buffer[idx];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], next[i]);
}
}
} else {
uint32_t input_offset = threadIdx.y;
uint32_t step = blockDim.y;
for (; input_offset < config.ctas_per_output; input_offset += step) {
uint32_t idx = config.staging_memory_offset(input_offset);
arg_vec_t next = reduce_buffer[idx];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], next[i]);
}
}
}
value = block_y_reduce(value, shared_memory);
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
if (should_store) {
if (accumulate) {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::translate_idx(value[i], base_idx);
}
}
if (acc == nullptr) {
if (accumulate) {
value = accumulate_in_output<output_vec_size>(out, value);
}
if (final_output) {
set_results_to_output<output_vec_size>(value, base_offsets);
} else {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
*(out[i]) = get_accumulated_output(out[i], value[i]);
}
}
} else {
if (accumulate) {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine((*acc)[i], value[i]);
}
}
if (final_output) {
set_results_to_output<output_vec_size>(value, base_offsets);
} else {
*acc = value;
}
}
}
}
return value;
} };
Failures:
Hi, I've been unsuccessfully trying to train a custom dataset. Getting the following errors, for single GPU:
And for multiple gpu mode:
I've got CUDA 11.3 runtime version installed in my ubuntu machine. I've tried with these pytorch version that are compatible according to requirements: 1.11.0 and 1.10.1
Please any help would be appreciated. Thanks.