The first time we did burst buffer, we branched the code and added a bit of code which never made it back into master. We need to make sure all this code is in master:
/* Runs plfs_protect on `file`, using the caller's rank in `comm` as the pid. */
int plfs_protect_all(const char *file, MPI_Comm comm);

/*
 * Looks up the calling process's rank within `comm` and forwards to
 * plfs_protect(), passing that rank in the pid slot.
 *
 * Returns whatever plfs_protect() returns.  The status of MPI_Comm_rank
 * is not inspected here.
 */
int
plfs_protect_all(const char *file, MPI_Comm comm)
{
    int my_rank;
    MPI_Comm_rank(comm, &my_rank);
    return plfs_protect(file, my_rank);
}
This is in container_internals.cpp right before the plfs_protect function.
// TODO: this code assumes that replication is done
// if replication is still active, removing these files
// will break replication and corrupt the file
// TODO: should this function be in this file
int
plfs_trim(const char *logical, pid_t pid)
{
PLFS_ENTER;
mlog(INT_DAPI, "%s on %s with %d\n",FUNCTION,logical,pid);
// this should be called after the plfs_protect is done
// currently it doesn't check to make sure that the plfs_protect
// was successful
// find all the paths
// shadow is the current shadowed subdir
// replica is the tempory, currently inaccessible, subdir in canonical
// metalink is the path to the current metalink in canonical
// we assume all the droppings in the shadow have been replicated so
// 1) rename replica to metalink (it will now be a canonical subdir)
// 2) remove all droppings owned by this pid
// 3) clean up the shadow container
ContainerPaths paths;
ret = findContainerPaths(logical,paths);
if (ret != 0) {
PLFS_EXIT(ret);
}
string replica = Container::getHostDirPath(paths.canonical,Util::hostname(),
TMP_SUBDIR);
string metalink = paths.canonical_hostdir;
// rename replica over metalink currently at paths.canonical_hostdir
// this could fail if a sibling was faster than us
// unfortunately it appears that rename of a dir over a metalink not atomic
mlog(INT_DCOMMON, "%s rename %s -> %s\n",FUNCTION,replica.c_str(),
paths.canonical_hostdir.c_str());
// remove the metalink
UnlinkOp op;
ret = op.op(paths.canonical_hostdir.c_str(),DT_LNK);
if (ret != 0 && (errno==ENOENT || errno==EISDIR)) {
// Ignore ENOENT and ISDIR, another sibling might have renamed it.
ret = 0;
}
if (ret != 0) {
PLFS_EXIT(ret);
}
// rename the replica at the right location
ret = Util::Rename(replica.c_str(),paths.canonical_hostdir.c_str());
if (ret != 0 && errno==ENOENT) {
ret = 0;
}
if (ret != 0) {
PLFS_EXIT(ret);
}
// remove all the droppings in paths.shadow_hostdir
set droppings;
ret = plfs_find_my_droppings(paths.shadow_hostdir,pid,droppings);
if (ret != 0) {
PLFS_EXIT(ret);
}
set::iterator itr;
for (itr=droppings.begin(); itr!=droppings.end(); itr++) {
ret = op.op(itr->c_str(),DT_REG);
if (ret!=0) {
PLFS_EXIT(ret);
}
}
// now remove paths.shadow_hostdir (which might fail due to slow siblings)
// then remove paths.shadow (which might fail due to slow siblings)
// the slowest sibling will succeed in removing the shadow container
op.ignoreErrno(ENOENT); // sibling beat us
op.ignoreErrno(ENOTEMPTY); // we beat sibling
ret = op.op(paths.shadow_hostdir.c_str(),DT_DIR);
if (ret!=0) {
PLFS_EXIT(ret);
}
ret = op.op(paths.shadow.c_str(),DT_DIR);
if (ret!=0) {
PLFS_EXIT(ret);
}
PLFS_EXIT(ret);
}
// Iterate through the container and find all pieces owned by this pid that
// are in shadowed subdirs. Currently this is done in a non-transactional,
// unsafe manner that assumes no failure in the middle:
// 1) blow away metalink in canonical
// 2) create a subdir in canonical
// 3) call SYNCER to move each piece owned by this pid in this subdir
This was in plfs.h:
/* Moves shadowed files into the canonical backends.
 * path: path of the file to protect; pid: id whose files are moved.
 */
int plfs_protect(const char *path, pid_t pid);
/* Deletes shadowed files from the shadow backends.
 * logical: logical path of the file; pid: id whose shadowed files are
 * deleted.  Intended to be called after plfs_protect has completed.
 */
int plfs_trim(const char *logical, pid_t pid);
/* Queries a Plfs_fd about how many writers and readers are using it,
 * the bytes written by the user, and the lazy_stat flag.
 * NOTE(review): writers, readers, bytes_written and lazy_stat look like
 * output parameters filled in by the call -- confirm against the
 * implementation, including whether NULL is accepted for any of them.
 */
int plfs_query( Plfs_fd *, size_t *writers, size_t *readers,
size_t *bytes_written, int *lazy_stat);
The first time we did burst buffer, we branched the code and added a bit of code which never made it back into master. We need to make sure all this code is in master:
/* Runs plfs_protect on `file`, using the caller's rank in `comm` as the pid. */
int plfs_protect_all(const char *file, MPI_Comm comm);

/*
 * Fetches the calling process's rank in `comm` and hands it to
 * plfs_protect() as the pid argument.  The result of plfs_protect() is
 * returned unchanged; the MPI_Comm_rank status is not examined.
 */
int
plfs_protect_all(const char *file, MPI_Comm comm)
{
    int caller_rank;
    MPI_Comm_rank(comm, &caller_rank);
    return plfs_protect(file, caller_rank);
}
This is in container_internals.cpp right before the plfs_protect function.
// TODO: this code assumes that replication is done // if replication is still active, removing these files // will break replication and corrupt the file // TODO: should this function be in this file int plfs_trim(const char *logical, pid_t pid) { PLFS_ENTER; mlog(INT_DAPI, "%s on %s with %d\n",FUNCTION,logical,pid); // this should be called after the plfs_protect is done // currently it doesn't check to make sure that the plfs_protect // was successful // find all the paths // shadow is the current shadowed subdir // replica is the tempory, currently inaccessible, subdir in canonical // metalink is the path to the current metalink in canonical // we assume all the droppings in the shadow have been replicated so // 1) rename replica to metalink (it will now be a canonical subdir) // 2) remove all droppings owned by this pid // 3) clean up the shadow container ContainerPaths paths; ret = findContainerPaths(logical,paths); if (ret != 0) { PLFS_EXIT(ret); } string replica = Container::getHostDirPath(paths.canonical,Util::hostname(), TMP_SUBDIR); string metalink = paths.canonical_hostdir; // rename replica over metalink currently at paths.canonical_hostdir // this could fail if a sibling was faster than us // unfortunately it appears that rename of a dir over a metalink not atomic mlog(INT_DCOMMON, "%s rename %s -> %s\n",FUNCTION,replica.c_str(), paths.canonical_hostdir.c_str()); // remove the metalink UnlinkOp op; ret = op.op(paths.canonical_hostdir.c_str(),DT_LNK); if (ret != 0 && (errno==ENOENT || errno==EISDIR)) { // Ignore ENOENT and ISDIR, another sibling might have renamed it. ret = 0; } if (ret != 0) { PLFS_EXIT(ret); } // rename the replica at the right location ret = Util::Rename(replica.c_str(),paths.canonical_hostdir.c_str()); if (ret != 0 && errno==ENOENT) { ret = 0; } if (ret != 0) { PLFS_EXIT(ret); } // remove all the droppings in paths.shadow_hostdir set droppings;
ret = plfs_find_my_droppings(paths.shadow_hostdir,pid,droppings);
if (ret != 0) {
PLFS_EXIT(ret);
}
set::iterator itr;
for (itr=droppings.begin(); itr!=droppings.end(); itr++) {
ret = op.op(itr->c_str(),DT_REG);
if (ret!=0) {
PLFS_EXIT(ret);
}
}
// now remove paths.shadow_hostdir (which might fail due to slow siblings)
// then remove paths.shadow (which might fail due to slow siblings)
// the slowest sibling will succeed in removing the shadow container
op.ignoreErrno(ENOENT); // sibling beat us
op.ignoreErrno(ENOTEMPTY); // we beat sibling
ret = op.op(paths.shadow_hostdir.c_str(),DT_DIR);
if (ret!=0) {
PLFS_EXIT(ret);
}
ret = op.op(paths.shadow.c_str(),DT_DIR);
if (ret!=0) {
PLFS_EXIT(ret);
}
PLFS_EXIT(ret);
}
// Iterate through the container and find all pieces owned by this pid that
// are in shadowed subdirs. Currently this is done in a non-transactional,
// unsafe manner that assumes no failure in the middle:
// 1) blow away metalink in canonical
// 2) create a subdir in canonical
// 3) call SYNCER to move each piece owned by this pid in this subdir
This was in plfs.h: