Open hariharan-devarajan opened 1 year ago
Lassen Machine link
The code hangs and times out on two UnifyFS calls: unifyfs_dispatch_transfer and unifyfs_finalize.
unifyfs_dispatch_transfer
unifyfs_finalize
For unifyfs_dispatch_transfer, it hangs with UNIFYFS_TRANSFER_MODE_MOVE but works with UNIFYFS_TRANSFER_MODE_COPY.
For unifyfs_finalize, the function times out at the RPC.
// Reproducer from the report: initialize UnifyFS with four logio options,
// create a file, write `args.iteration` requests followed by a SYNC_META
// request, then dispatch a MOVE-mode transfer and finalize. Per the report,
// the hang is observed at unifyfs_dispatch_transfer (MOVE mode only) and the
// timeout at unifyfs_finalize.
// NOTE(review): this looks like an excerpt stitched from more than one scope
// (`rc` is declared twice, `options_ct` and `full_filename_path` have no
// visible declaration) -- confirm against the full test source.
unifyfs_handle fshdl;
// NOTE(review): no type on `options_ct` here; presumably declared outside the excerpt.
options_ct = 4;
// Zero-initialized option array; it is not freed anywhere in the visible snippet.
unifyfs_cfg_option *options = static_cast<unifyfs_cfg_option *>(
    calloc(options_ct, sizeof(unifyfs_cfg_option)));
options[0] = {.opt_name = "logio.spill_dir", .opt_value = logio_spill_dir};
options[1] = {.opt_name = "logio.spill_size", .opt_value = logio_spill_size};
options[2] = {.opt_name = "logio.shmem_size", .opt_value = logio_shmem_size};
options[3] = {.opt_name = "logio.chunk_size", .opt_value = logio_chunk_size};
// NOTE(review): `fshdl` is passed by value here, while unifyfs_create below
// takes `&gfid` for its output -- confirm whether the API expects `&fshdl`.
int rc = unifyfs_initialize(info.unifyfs_path.c_str(), options, options_ct, fshdl);
REQUIRE(rc == UNIFYFS_SUCCESS);
fs::path unifyfs_filename = info.unifyfs_path / filename;
unifyfs_gfid gfid = 0;
// Second declaration of `rc` (see the note at the top of the snippet).
int rc = UNIFYFS_SUCCESS;
int create_flags = 0;
// Only the unifyfs_create call is bracketed by the open_time timer.
open_time.resumeTime();
rc = unifyfs_create(fshdl, create_flags, unifyfs_filename.c_str(), &gfid);
open_time.pauseTime();
INFO("unifyfs rc " << strerror(rc));
REQUIRE(rc == UNIFYFS_SUCCESS);
REQUIRE(gfid != UNIFYFS_INVALID_GFID);
if (info.rank == 0) INFO("Writing data");
/* Write data to file */
// One buffer of request_size * iteration bytes, filled with 'w'; each write
// request below points at its own request_size slice of it.
auto write_data = std::vector<char>(args.request_size * args.iteration, 'w');
// One extra slot is reserved for the trailing SYNC_META request.
size_t write_req_ct = args.iteration + 1;
// Array length is a runtime value; in C++ this relies on a compiler extension.
unifyfs_io_request write_req[write_req_ct];
for (size_t i = 0; i < args.iteration; ++i) {
  write_req[i].op = UNIFYFS_IOREQ_OP_WRITE;
  write_req[i].gfid = gfid;
  write_req[i].nbytes = args.request_size;
  off_t base_offset = 0;
  // For a shared file, each rank's region starts at rank * request_size * iteration;
  // otherwise every rank writes from offset 0.
  if (args.file_sharing == tt::FileSharing::SHARED_FILE) {
    base_offset = (off_t)info.rank * args.request_size * args.iteration;
  }
  off_t relative_offset = i * args.request_size;
  write_req[i].offset = base_offset + relative_offset;
  write_req[i].user_buf = write_data.data() + (i * args.request_size);
}
// Final request in the batch is a metadata sync on the same gfid.
write_req[args.iteration].op = UNIFYFS_IOREQ_OP_SYNC_META;
write_req[args.iteration].gfid = gfid;
rc = unifyfs_dispatch_io(fshdl, write_req_ct, write_req);
// Results are only checked when both dispatch and wait return success;
// a failing `rc` from either call is not asserted on in this snippet.
if (rc == UNIFYFS_SUCCESS) {
  int waitall = 1;
  rc = unifyfs_wait_io(fshdl, write_req_ct, write_req, waitall);
  if (rc == UNIFYFS_SUCCESS) {
    for (size_t i = 0; i < args.iteration; i++) {
      REQUIRE(write_req[i].result.error == 0);
      REQUIRE(write_req[i].result.count == args.request_size);
    }
    REQUIRE(write_req[args.iteration].result.error == 0);
  }
}
MPI_Barrier(MPI_COMM_WORLD);
if (info.rank == 0) PRINT_MSG("Finished Writing", "");
if (info.rank == 0) INFO("Flushing data");
// The transfer request is declared without an initializer; only src_path,
// dst_path, mode and use_parallel are assigned before dispatch.
unifyfs_transfer_request mv_req;
mv_req.src_path = unifyfs_filename.c_str();
// NOTE(review): `full_filename_path` is not defined in the visible snippet.
mv_req.dst_path = full_filename_path.c_str();
// MOVE is the mode the report says hangs; COPY is reported to work.
mv_req.mode = UNIFYFS_TRANSFER_MODE_MOVE;
mv_req.use_parallel = 1;
// No rank guard here, unlike the rank-0-only INFO/PRINT_MSG calls above.
rc = unifyfs_dispatch_transfer(fshdl, 1, &mv_req);
REQUIRE(rc == UNIFYFS_SUCCESS);
// Same condition as the REQUIRE directly above.
if (rc == UNIFYFS_SUCCESS) {
  int waitall = 1;
  rc = unifyfs_wait_transfer(fshdl, 1, &mv_req, waitall);
  if (rc == UNIFYFS_SUCCESS) {
    // Bound is the constant 1, so this body runs exactly once.
    for (int i = 0; i < (int)1; i++) {
      REQUIRE(mv_req.result.error == 0);
    }
  }
}
MPI_Barrier(MPI_COMM_WORLD);
// Per the report, this call times out at the RPC; its return value is not checked here.
rc = unifyfs_finalize(fshdl);
System information
Lassen Machine link
Describe the problem you're observing
The code hangs and times out on two UnifyFS calls:
unifyfs_dispatch_transfer
and unifyfs_finalize
For unifyfs_dispatch_transfer, it hangs with UNIFYFS_TRANSFER_MODE_MOVE but works with UNIFYFS_TRANSFER_MODE_COPY.
For unifyfs_finalize, the function times out at the RPC.
Describe how to reproduce the problem
Include any warnings, errors, or relevant debugging data