Closed hehuaijin closed 1 year ago
This is the full fixed code:
// Hossein Moein
// September 12, 2017
/*
Copyright (c) 2019-2026, Hossein Moein All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Hossein Moein BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
// ----------------------------------------------------------------------------
namespace hmdf {
// Joins this frame with rhs on the index column according to mp.
//
// Each index is converted into (pointer to value, original row position)
// pairs and sorted by the pointed-to value. The sorted pairs are handed to the
// join routine selected by mp; any policy other than inner, left or right
// ends up in the left-right join.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
DataFrame<I, H> DataFrame<I, H>::
join_by_index (const RHS_T &rhs, join_policy mp) const  {

    static_assert(
        std::is_base_of<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>,
                        RHS_T>::value ||
        std::is_base_of<View, RHS_T>::value ||
        std::is_base_of<PtrView, RHS_T>::value,
        "The rhs argument to join_by_index() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>");

    using pair_vec_t = StlVecType<JoinSortingPair<IndexType>>;

    // Orders the pairs by the index value they point to
    auto    by_value =
        [] (const JoinSortingPair<IndexType> &a,
            const JoinSortingPair<IndexType> &b) -> bool  {
            return (*(a.first) < *(b.first));
        };

    // This frame's index, sorted by value
    const auto      &self_index = get_index();
    const size_type self_rows = self_index.size();
    pair_vec_t      self_sorted;

    self_sorted.reserve(self_rows);
    for (size_type row = 0; row < self_rows; ++row)
        self_sorted.push_back(std::make_pair(&(self_index[row]), row));
    std::sort(self_sorted.begin(), self_sorted.end(), by_value);

    // The other frame's index, sorted by value
    const auto      &other_index = rhs.get_index();
    const size_type other_rows = other_index.size();
    pair_vec_t      other_sorted;

    other_sorted.reserve(other_rows);
    for (size_type row = 0; row < other_rows; ++row)
        other_sorted.push_back(std::make_pair(&(other_index[row]), row));
    std::sort(other_sorted.begin(), other_sorted.end(), by_value);

    if (mp == join_policy::inner_join)
        return (index_inner_join_<decltype(*this), RHS_T, Ts ...>
                    (*this, rhs, self_sorted, other_sorted));
    if (mp == join_policy::left_join)
        return (index_left_join_<decltype(*this), RHS_T, Ts ...>
                    (*this, rhs, self_sorted, other_sorted));
    if (mp == join_policy::right_join)
        return (index_right_join_<decltype(*this), RHS_T, Ts ...>
                    (*this, rhs, self_sorted, other_sorted));

    // join_policy::left_right_join and anything unrecognized
    return (index_left_right_join_<decltype(*this), RHS_T, Ts ...>
                (*this, rhs, self_sorted, other_sorted));
}
// ----------------------------------------------------------------------------
// Joins this frame with rhs on the column called name (of type T in both
// frames) according to mp. The result is indexed by unsigned int.
//
// Each side's column is converted into (pointer to value, original row
// position) pairs and sorted by the pointed-to value. The sorted pairs are
// handed to the join routine selected by mp; any policy other than inner,
// left or right ends up in the left-right join.
//
template<typename I, typename H>
template<typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, H> DataFrame<I, H>::
join_by_column (const RHS_T &rhs, const char *name, join_policy mp) const  {

    static_assert(
        std::is_base_of<
            DataFrame<I,
                      HeteroVector<std::size_t(H::align_value)>>,
                      RHS_T>::value ||
        std::is_base_of<View, RHS_T>::value ||
        std::is_base_of<PtrView, RHS_T>::value,
        "The rhs argument to join_by_column() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>");

    using pair_vec_t = StlVecType<JoinSortingPair<T>>;

    // Orders the pairs by the column value they point to
    auto    by_value =
        [] (const JoinSortingPair<T> &a,
            const JoinSortingPair<T> &b) -> bool  {
            return (*(a.first) < *(b.first));
        };

    // This frame's join column, sorted by value
    const auto      &self_col = get_column<T>(name);
    const size_type self_rows = self_col.size();
    pair_vec_t      self_sorted;

    self_sorted.reserve(self_rows);
    for (size_type row = 0; row < self_rows; ++row)
        self_sorted.push_back(std::make_pair(&(self_col[row]), row));

    // The other frame's join column, sorted by value
    const auto      &other_col = rhs.template get_column<T>(name);
    const size_type other_rows = other_col.size();
    pair_vec_t      other_sorted;

    other_sorted.reserve(other_rows);
    for (size_type row = 0; row < other_rows; ++row)
        other_sorted.push_back(std::make_pair(&(other_col[row]), row));

    std::sort(self_sorted.begin(), self_sorted.end(), by_value);
    std::sort(other_sorted.begin(), other_sorted.end(), by_value);

    if (mp == join_policy::inner_join)
        return (column_inner_join_<decltype(*this), RHS_T, T, Ts ...>
                    (*this, rhs, name, self_sorted, other_sorted));
    if (mp == join_policy::left_join)
        return (column_left_join_<decltype(*this), RHS_T, T, Ts ...>
                    (*this, rhs, name, self_sorted, other_sorted));
    if (mp == join_policy::right_join)
        return (column_right_join_<decltype(*this), RHS_T, T, Ts ...>
                    (*this, rhs, name, self_sorted, other_sorted));

    // join_policy::left_right_join and anything unrecognized
    return (column_left_right_join_<decltype(*this), RHS_T, T, Ts ...>
                (*this, rhs, name, self_sorted, other_sorted));
}
// ----------------------------------------------------------------------------
// Copies the data columns of lhs and rhs into result, row-mapped through
// joined_index_idx.
//
// lhs                  Left frame of the join
// rhs                  Right frame of the join
// joined_index_idx     (lhs row, rhs row) pairs produced by one of the
//                      get_*_index_idx_vector_() routines
// result               Frame receiving the joined columns
// skip_col_name        If not null, a column of this name is not loaded here
//                      (the column join helper loads it itself)
//
// Columns present in both frames go through index_join_functor_common_;
// columns present on one side only go through index_join_functor_oneside_.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename IDX_T, typename ... Ts>
void DataFrame<I, H>::
join_helper_common_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const IndexIdxVector &joined_index_idx,
    DataFrame<IDX_T, HeteroVector<std::size_t(H::align_value)>> &result,
    const char *skip_col_name)  {

    const SpinGuard guard(lock_);

    // Load the common and lhs columns
    for (const auto &iter : lhs.column_list_)  {
        if (skip_col_name && iter.first == skip_col_name)  continue;

        auto    rhs_citer = rhs.column_tb_.find(iter.first);

        // Common column between two frames
        if (rhs_citer != rhs.column_tb_.end())  {
            index_join_functor_common_<decltype(result), Ts ...> functor(
                iter.first.c_str(),
                rhs,
                joined_index_idx,
                result);

            lhs.data_[iter.second].change(functor);
        }
        else  {  // lhs only column
            // 0 = Left
            index_join_functor_oneside_<0, decltype(result), Ts ...> functor (
                iter.first.c_str(),
                joined_index_idx,
                result);

            lhs.data_[iter.second].change(functor);
        }
    }

    // Load the rhs columns
    for (const auto &iter : rhs.column_list_)  {
        if (skip_col_name && iter.first == skip_col_name)  continue;

        auto    lhs_citer = lhs.column_tb_.find(iter.first);

        if (lhs_citer == lhs.column_tb_.end())  {  // rhs only column
            // 1 = Right
            index_join_functor_oneside_<1, decltype(result), Ts ...> functor (
                iter.first.c_str(),
                joined_index_idx,
                result);

            rhs.data_[iter.second].change(functor);
        }
    }
}
// ----------------------------------------------------------------------------
// Builds the result of an index join from the (lhs row, rhs row) pairs in
// joined_index_idx.
//
// The result index takes the lhs index value for every pair that has an lhs
// row and the rhs index value otherwise. A row position equal to
// std::numeric_limits<size_type>::max() marks "no row on that side".
// The data columns are then loaded by join_helper_common_().
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
index_join_helper_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const IndexIdxVector &joined_index_idx)  {

    DataFrame<IndexType, HeteroVector<align_value>> result;
    StlVecType<IndexType>                           result_index;

    // Load the index
    result_index.reserve(joined_index_idx.size());
    for (const auto &citer : joined_index_idx)  {
        const size_type left_i = std::get<0>(citer);

        result_index.push_back(
            left_i != std::numeric_limits<size_type>::max()
                ? lhs.indices_[left_i] : rhs.indices_[std::get<1>(citer)]);
    }
    result.load_index(std::move(result_index));

    join_helper_common_<LHS_T, RHS_T, IndexType, Ts ...>
        (lhs, rhs, joined_index_idx, result);
    return(result);
}
// ----------------------------------------------------------------------------
// Builds the result of a column join from the (lhs row, rhs row) pairs in
// joined_index_idx.
//
// The result gets a fresh sequence index generated by
// gen_sequence_index(0, joined_index_idx.size(), 1). The original lhs and rhs
// index values are kept in two columns named "lhs.<DF_INDEX_COL_NAME>" and
// "rhs.<DF_INDEX_COL_NAME>"; a side with no row for a given pair gets
// get_nan<>() of its index type. The join column col_name is taken from lhs
// when the pair has an lhs row and from rhs otherwise. The remaining columns
// are loaded by join_helper_common_(), which is told to skip col_name.
//
// A row position equal to std::numeric_limits<size_type>::max() marks
// "no row on that side".
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_join_helper_(const LHS_T &lhs,
                    const RHS_T &rhs,
                    const char *col_name,
                    const IndexIdxVector &joined_index_idx)  {

    using left_idx_t = typename std::remove_reference<LHS_T>::type::IndexType;
    using right_idx_t = typename std::remove_reference<RHS_T>::type::IndexType;

    const size_type                                     jii_s =
        joined_index_idx.size();
    DataFrame<unsigned int, HeteroVector<align_value>>  result;

    // Load the new result index
    result.load_index(
        DataFrame<unsigned int, HeteroVector<align_value>>::gen_sequence_index(
            0, static_cast<unsigned int>(jii_s), 1));

    // Load the lhs and rhs indices into two columns in the result
    // Also load the unified named column
    StlVecType<left_idx_t>  lhs_index;
    StlVecType<right_idx_t> rhs_index;
    StlVecType<T>           named_col_vec;
    const ColumnVecType<T>  &lhs_named_col_vec =
        lhs.template get_column<T>(col_name);
    const ColumnVecType<T>  &rhs_named_col_vec =
        rhs.template get_column<T>(col_name);

    lhs_index.reserve(jii_s);
    rhs_index.reserve(jii_s);
    named_col_vec.reserve(jii_s);
    for (const auto &citer : joined_index_idx)  {
        const size_type left_i = std::get<0>(citer);
        const size_type right_i = std::get<1>(citer);

        if (left_i != std::numeric_limits<size_type>::max())  {
            lhs_index.push_back(lhs.indices_[left_i]);
            named_col_vec.push_back(lhs_named_col_vec[left_i]);
        }
        else  {
            named_col_vec.push_back(rhs_named_col_vec[right_i]);
            lhs_index.push_back(get_nan<left_idx_t>());
        }
        if (right_i != std::numeric_limits<size_type>::max())
            rhs_index.push_back(rhs.indices_[right_i]);
        else
            rhs_index.push_back(get_nan<right_idx_t>());
    }

    {
        char            buffer[64];
        const SpinGuard guard(lock_);

        ::snprintf(buffer, sizeof(buffer) - 1, "lhs.%s", DF_INDEX_COL_NAME);
        result.template load_column<left_idx_t>(buffer,
                                                std::move(lhs_index),
                                                nan_policy::pad_with_nans,
                                                false);
        ::snprintf(buffer, sizeof(buffer) - 1, "rhs.%s", DF_INDEX_COL_NAME);
        result.template load_column<right_idx_t>(buffer,
                                                 std::move(rhs_index),
                                                 nan_policy::pad_with_nans,
                                                 false);
        result.template load_column<T>(col_name,
                                       std::move(named_col_vec),
                                       nan_policy::pad_with_nans,
                                       false);
    }

    join_helper_common_<LHS_T, RHS_T, unsigned int, Ts ...>
        (lhs, rhs, joined_index_idx, result, col_name);
    return(result);
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(std::min(lhs_end, rhs_end));
while (lhs_current != lhs_end && rhs_current != rhs_end) {
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first))
lhs_current += 1;
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else //add this row to fix
rhs_current += 1;
}
}
return (joined_index_idx);
}
// ----------------------------------------------------------------------------
// Inner join on the index: computes the matching row pairs from the two
// key-sorted pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_inner_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_inner_index_idx_vector_<IndexType>(col_vec_lhs,
                                                        col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Inner join on column col_name: computes the matching row pairs from the
// two key-sorted pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_inner_join_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const char *col_name,
                   const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                   const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_inner_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(lhs_end);
while (lhs_current != lhs_end || rhs_current != rhs_end) {
if (lhs_current >= lhs_end) break;
if (rhs_current >= rhs_end) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
continue;
}
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else // add this row fix
rhs_current += 1;
}
}
return (joined_index_idx);
} // ----------------------------------------------------------------------------
// Left join on the index: computes the row pairs from the two key-sorted
// pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_left_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_left_index_idx_vector_<IndexType>(col_vec_lhs,
                                                       col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Left join on column col_name: computes the row pairs from the two
// key-sorted pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_left_join_(const LHS_T &lhs,
                  const RHS_T &rhs,
                  const char *col_name,
                  const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                  const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_left_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(rhs_end);
while (lhs_current != lhs_end || rhs_current != rhs_end) {
if (rhs_current >= rhs_end) break;
if (lhs_current >= lhs_end) {
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current++].second);
continue;
}
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first))
lhs_current += 1;
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else{
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
rhs_current += 1;
}
}
}
return (joined_index_idx);
}
// ----------------------------------------------------------------------------
// Right join on the index: computes the row pairs from the two key-sorted
// pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_right_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_right_index_idx_vector_<IndexType>(col_vec_lhs,
                                                        col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Right join on column col_name: computes the row pairs from the two
// key-sorted pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_right_join_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const char *col_name,
                   const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                   const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_right_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));
}
// ----------------------------------------------------------------------------
template<typename I, typename H>
template
size_type lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector joined_index_idx;
joined_index_idx.reserve(std::max(lhs_end, rhs_end));
while (lhs_current != lhs_end || rhs_current != rhs_end) {
if (lhs_current >= lhs_end && rhs_current < rhs_end) {
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current++].second);
continue;
}
if (rhs_current >= rhs_end && lhs_current < lhs_end) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
continue;
}
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first)) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
}
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else
{ //add this row to fix
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
rhs_current += 1;
} //add this row to fix
}
}
return (joined_index_idx);
}
// ----------------------------------------------------------------------------
// Left-right join on the index: computes the row pairs from the two
// key-sorted pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
index_left_right_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_left_right_index_idx_vector_<IndexType>(col_vec_lhs,
                                                             col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Left-right join on column col_name: computes the row pairs from the two
// key-sorted pair vectors and builds the joined frame from them.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_left_right_join_(const LHS_T &lhs,
                        const RHS_T &rhs,
                        const char *col_name,
                        const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                        const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_left_right_index_idx_vector_<T>(col_vec_lhs,
                                                     col_vec_rhs)));
}
// ----------------------------------------------------------------------------
// Appends rhs to lhs in place.
//
// The rhs index is appended to the lhs index first. Then every column that
// exists in both frames is passed through concat_functor_. If
// add_new_columns is true, columns that exist only in rhs are passed through
// concat_functor_ as well, with its third argument set to true instead of
// false. orig_index_s, the lhs index size before the append, is handed to
// every functor.
//
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
void DataFrame<I, H>::
concat_helper_(LHS_T &lhs, const RHS_T &rhs, bool add_new_columns)  {

    const size_type orig_index_s = lhs.get_index().size();

    lhs.get_index().insert(lhs.get_index().end(),
                           rhs.get_index().begin(), rhs.get_index().end());

    // Load common columns
    for (const auto &lhs_iter : lhs.column_list_)  {
        auto    rhs_citer = rhs.column_tb_.find(lhs_iter.first);

        if (rhs_citer != rhs.column_tb_.end())  {
            concat_functor_<LHS_T, Ts ...>  functor(lhs_iter.first.c_str(),
                                                    lhs,
                                                    false,
                                                    orig_index_s);

            rhs.data_[rhs_citer->second].change(functor);
        }
    }

    // Load columns from rhs that do not exist in lhs
    if (add_new_columns)  {
        for (const auto &rhs_citer : rhs.column_list_)  {
            auto    lhs_iter = lhs.column_tb_.find(rhs_citer.first);

            if (lhs_iter == lhs.column_tb_.end())  {
                concat_functor_<LHS_T, Ts ...>  functor(rhs_citer.first.c_str(),
                                                        lhs,
                                                        true,
                                                        orig_index_s);

                rhs.data_[rhs_citer.second].change(functor);
            }
        }
    }
}
// ----------------------------------------------------------------------------
// Appends rhs to this frame in place. add_new_columns is forwarded to
// concat_helper_(), which uses it to decide whether columns found only in
// rhs are added.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
void DataFrame<I, H>::
self_concat(const RHS_T &rhs, bool add_new_columns)  {

    using std_frame_t =
        DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    static_assert(
        (std::is_base_of<std_frame_t, RHS_T>::value ||
         std::is_base_of<View, RHS_T>::value ||
         std::is_base_of<PtrView, RHS_T>::value) &&
        ! std::is_base_of<std_frame_t, decltype(*this)>::value,
        "The rhs argument to self_concat() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>. "
        "Self must be StdDataFrame<IndexType>");

    // The guard on lock_ stays in scope while the helper runs
    const SpinGuard guard(lock_);

    concat_helper_<decltype(*this), RHS_T, Ts ...>(*this, rhs, add_new_columns);
}
// ----------------------------------------------------------------------------
// Returns a new frame made of this frame followed by rhs.
//
// all_columns and lhs_and_common_columns start from a copy of this frame;
// concat_helper_() is told to add rhs-only columns only for all_columns.
// common_columns starts from this frame's index plus only those of its
// columns that also exist in rhs. Any other policy returns an empty frame.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
DataFrame<I, H> DataFrame<I, H>::
concat(const RHS_T &rhs, concat_policy cp) const  {

    using std_frame_t =
        DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    static_assert(
        (std::is_base_of<std_frame_t, RHS_T>::value ||
         std::is_base_of<View, RHS_T>::value ||
         std::is_base_of<PtrView, RHS_T>::value) &&
        ! std::is_base_of<std_frame_t, decltype(*this)>::value,
        "The rhs argument to concat() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>. "
        "Self must be StdDataFrame<IndexType>");

    using result_t = DataFrame<I, HeteroVector<align_value>>;

    result_t        joined;
    const SpinGuard guard(lock_);
    const bool      with_rhs_only_cols = (cp == concat_policy::all_columns);

    if (with_rhs_only_cols || cp == concat_policy::lhs_and_common_columns)  {
        joined = *this;
        concat_helper_<result_t, RHS_T, Ts ...>(
            joined, rhs, with_rhs_only_cols);
    }
    else if (cp == concat_policy::common_columns)  {
        joined.load_index(this->get_index().begin(), this->get_index().end());

        // Seed the result with this frame's columns that rhs also has
        for (const auto &own_col : column_list_)  {
            if (rhs.column_tb_.find(own_col.first) == rhs.column_tb_.end())
                continue;

            load_all_functor_<Ts ...>   loader(own_col.first.c_str(), joined);

            data_[own_col.second].change(loader);
        }
        concat_helper_<result_t, RHS_T, Ts ...>(joined, rhs, false);
    }

    return (joined);
}
// ----------------------------------------------------------------------------
// Returns a PtrView over this frame followed by rhs.
//
// The view's index holds the addresses of this frame's index entries
// followed by the addresses of the rhs index entries. Which columns are
// passed through concat_load_view_functor_ depends on cp:
//   all_columns             every column of this frame, then every column
//                           of rhs
//   lhs_and_common_columns  every column of this frame, plus the rhs column
//                           of the same name when rhs has one
//   common_columns          only columns that exist in both frames
// Any other policy leaves the view without columns.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
typename DataFrame<I, H>::PtrView
DataFrame<I, H>::concat_view(RHS_T &rhs, concat_policy cp)  {

    using std_frame_t =
        DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    static_assert(
        ! std::is_base_of<std_frame_t, RHS_T>::value ||
        ! std::is_base_of<std_frame_t, decltype(*this)>::value,
        "Currently, arguments to concat_view() can only be "
        "StdDataFrame<IndexType>.");

    using idxvec_t = typename PtrView::IndexVecType;
    using loader_t = concat_load_view_functor_<PtrView, Ts ...>;

    PtrView         view;
    const size_type own_rows = get_index().size();
    const size_type rhs_rows = rhs.get_index().size();
    idxvec_t        index_ptrs;

    index_ptrs.reserve(own_rows + rhs_rows);
    for (size_type row = 0; row < own_rows; ++row)
        index_ptrs.push_back(&(get_index()[row]));
    for (size_type row = 0; row < rhs_rows; ++row)
        index_ptrs.push_back(&(rhs.get_index()[row]));
    view.indices_ = std::move(index_ptrs);

    if (cp == concat_policy::all_columns)  {
        for (const auto &own_col : column_list_)  {
            loader_t    loader(own_col.first.c_str(), view);

            data_[own_col.second].change(loader);
        }
        for (const auto &rhs_col : rhs.column_list_)  {
            loader_t    loader(rhs_col.first.c_str(), view);

            rhs.data_[rhs_col.second].change(loader);
        }
    }
    else if (cp == concat_policy::lhs_and_common_columns)  {
        for (const auto &own_col : column_list_)  {
            loader_t    loader(own_col.first.c_str(), view);

            data_[own_col.second].change(loader);

            // The same functor instance also receives the rhs column
            auto    rhs_pos = rhs.column_tb_.find(own_col.first);

            if (rhs_pos != rhs.column_tb_.end())
                rhs.data_[rhs_pos->second].change(loader);
        }
    }
    else if (cp == concat_policy::common_columns)  {
        for (const auto &own_col : column_list_)  {
            loader_t    loader(own_col.first.c_str(), view);
            auto        rhs_pos = rhs.column_tb_.find(own_col.first);

            if (rhs_pos == rhs.column_tb_.end())  continue;

            data_[own_col.second].change(loader);
            rhs.data_[rhs_pos->second].change(loader);
        }
    }

    return (view);
}
// ----------------------------------------------------------------------------
// Const overload: returns a ConstPtrView over this frame followed by rhs.
//
// The view's index holds the addresses of this frame's index entries
// followed by the addresses of the rhs index entries. Which columns are
// passed through concat_load_view_functor_ depends on cp:
//   all_columns             every column of this frame, then every column
//                           of rhs
//   lhs_and_common_columns  every column of this frame, plus the rhs column
//                           of the same name when rhs has one
//   common_columns          only columns that exist in both frames
// Any other policy leaves the view without columns.
//
template<typename I, typename H>
template<typename RHS_T, typename ... Ts>
typename DataFrame<I, H>::ConstPtrView
DataFrame<I, H>::concat_view(RHS_T &rhs, concat_policy cp) const  {

    using std_frame_t =
        DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    static_assert(
        ! std::is_base_of<std_frame_t, RHS_T>::value ||
        ! std::is_base_of<std_frame_t, decltype(*this)>::value,
        "Currently, arguments to concat_view() can only be "
        "StdDataFrame<IndexType>.");

    using idxvec_t = typename ConstPtrView::IndexVecType;
    using loader_t = concat_load_view_functor_<ConstPtrView, Ts ...>;

    ConstPtrView    view;
    const size_type own_rows = get_index().size();
    const size_type rhs_rows = rhs.get_index().size();
    idxvec_t        index_ptrs;

    index_ptrs.reserve(own_rows + rhs_rows);
    for (size_type row = 0; row < own_rows; ++row)
        index_ptrs.push_back(&(get_index()[row]));
    for (size_type row = 0; row < rhs_rows; ++row)
        index_ptrs.push_back(&(rhs.get_index()[row]));
    view.indices_ = std::move(index_ptrs);

    if (cp == concat_policy::all_columns)  {
        for (const auto &own_col : column_list_)  {
            loader_t    loader(own_col.first.c_str(), view);

            data_[own_col.second].change(loader);
        }
        for (const auto &rhs_col : rhs.column_list_)  {
            loader_t    loader(rhs_col.first.c_str(), view);

            rhs.data_[rhs_col.second].change(loader);
        }
    }
    else if (cp == concat_policy::lhs_and_common_columns)  {
        for (const auto &own_col : column_list_)  {
            loader_t    loader(own_col.first.c_str(), view);

            data_[own_col.second].change(loader);

            // The same functor instance also receives the rhs column
            auto    rhs_pos = rhs.column_tb_.find(own_col.first);

            if (rhs_pos != rhs.column_tb_.end())
                rhs.data_[rhs_pos->second].change(loader);
        }
    }
    else if (cp == concat_policy::common_columns)  {
        for (const auto &own_col : column_list_)  {
            loader_t    loader(own_col.first.c_str(), view);
            auto        rhs_pos = rhs.column_tb_.find(own_col.first);

            if (rhs_pos == rhs.column_tb_.end())  continue;

            data_[own_col.second].change(loader);
            rhs.data_[rhs_pos->second].change(loader);
        }
    }

    return (view);
}
} // namespace hmdf
// ----------------------------------------------------------------------------
// Local Variables: // mode:C++ // tab-width:4 // c-basic-offset:4 // End:
Thank you for looking into this. Why don't you submit this as a PR (pull request)? That way it goes through all the tests and you get the credit for it, if you care about that.
您好,您的邮件已收到,我会尽快查看并回复您。谢谢!
Thank you for the reply! I will try to learn how to submit this as a PR!
阿黑 @.***
------------------ 原始邮件 ------------------ 发件人: "Hossein @.>; 发送时间: 2023年4月27日(星期四) 晚上9:30 收件人: @.>; 抄送: @.>; @.>; 主题: Re: [hosseinmoein/DataFrame] dataframe_join.tcc miss { } (Issue #238)
Thank you for looking into this. Why don't you submit this as a PR (pull request)? That way it goes through all the tests and you get the credit for it, if you care about that.
— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you authored the thread.Message ID: @.***>
I looked at this further. I don't think this is a bug. It should work properly. But I could be wrong. Can you show me through an example this is a bug?
Thanks
Sorry, my apologies for the late reply.
This is a sample:
/***/
using namespace hmdf;

// A DataFrame with ulong index type
//
using ULDataFrame = StdDataFrame<unsigned long>;

// A DataFrame with string index type
//
using StrDataFrame = StdDataFrame<std::string>;

// A DataFrame with DateTime index type
//
using DTDataFrame = StdDataFrame<DateTime>;
// Reproducer for the join issue: left-joins a 14-row frame with a 1-row frame
// on the double column "djoincol" and prints both inputs and the result.
//
// Thirteen of the fourteen lhs rows carry the key 1, which is also the only
// key in the rhs frame.
//
void test_index_left_join() {

    using MyDataFrame = ULDataFrame;

    std::cout << "\nTesting Index Left Join ..." << std::endl;

    std::vector<unsigned long>  idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456,
          123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double> djoincol1 =
        { 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 14 };
    std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double> d2 =
        { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    // NOTE(review): d3 (13 values) and i1 (5 values) are shorter than the
    // 14-entry index; presumably load_data() pads them -- confirm.
    std::vector<double> d3 =
        { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>    i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame         df;

    df.load_data(std::move(idx),
                 std::make_pair("djoincol", djoincol1),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    std::vector<unsigned long>  idx2 =
        {1 };
    std::vector<double> djoincol2 = { 1 };
    std::vector<double> d12 = { 11 };
    std::vector<double> d22 = { 18 };
    std::vector<double> d32 = { 115 };
    std::vector<int>    i12 = { 122 };
    MyDataFrame         df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("djoincol", djoincol2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    std::cout << "First DF:" << std::endl;
    df.write<std::ostream, double, int>(std::cout);
    std::cout << "Second DF2:" << std::endl;
    df2.write<std::ostream, double, int>(std::cout);

    // Template arguments: rhs type, type of the join column, then the types
    // of all columns to carry over. double has to be listed again among the
    // column types, otherwise the double data columns are left out.
    auto    join_df =
        df.join_by_column<decltype(df2), double, double, int>(
            df2, "djoincol", hmdf::join_policy::left_join);

    std::cout << "Now The joined DF:" << std::endl;
    // unsigned long is listed so that the columns holding the original
    // lhs/rhs index values are printed too.
    join_df.write<std::ostream, double, int, unsigned long>(std::cout);
}
write out:
Testing Index Left Join ...
First DF:
INDEX:14:
Second DF2:
INDEX:1:
Now The joined DF:
INDEX:14:
Should be:
djoincol:14:
I am a bit confused. The code that you are saying is wrong and that you corrected (line 577 in file DataFrame_join.tcc) is for the left-right
join (aka merge). The code example above is for a left
outer join. In other words, the example above doesn't execute the code you corrected.
There is more than one similar bug. I posted the fixed code, with the changes marked by the comment "//add this row to fix".
Can you please submit a pull request?
I am still confused. Your comments //add this row to fix
are only in two functions
get_inner_index_idx_vector_()
get_left_right_index_idx_vector_()
Neither of these functions is executed in the code sample you posted in test_index_left_join()
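get_left_right_index_idx_vector_:
while (lhs_current != lhs_end || rhs_current != rhs_end) {
.......
if (*(col_vec_lhs[lhs_current].first) <
*(col_vec_rhs[rhs_current].first)) {
joined_index_idx.emplace_back(
col_vec_lhs[lhs_current++].second,
std::numeric_limits<size_type>::max());
}
else {
if (*(col_vec_lhs[lhs_current].first) ==
*(col_vec_rhs[rhs_current].first))
joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second,
col_vec_rhs[rhs_current].second);
else
{ //add this row to fix
joined_index_idx.emplace_back(
std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
rhs_current += 1;
} //add this row to fix
}
}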
Thank you for looking into this. But I believe the original behavior/code is correct. making your changes will introduce the bug of repeating the RHS values where they shouldn't be there
Also your code sample above is missing some type specifications. The correct code is:
// Maintainer's version of the reproducer: left-joins a 14-row frame with a
// 1-row frame on the double column "djoincol" and prints both inputs and the
// result in io_format::csv2.
static void test_index_left_join() {
using MyDataFrame = ULDataFrame;
std::cout << "\nTesting Index Left Join ..." << std::endl;
// Left frame: 14 index entries; 13 of the 14 join keys are 1
std::vector<unsigned long> idx =
{ 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
std::vector<double> djoincol1 = { 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 14 };
std::vector<double> d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
std::vector<double> d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
// NOTE(review): d3 (13 values) and i1 (5 values) are shorter than the index;
// presumably load_data() pads them -- confirm.
std::vector<double> d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
std::vector<int> i1 = { 22, 23, 24, 25, 99 };
MyDataFrame df;
df.load_data(std::move(idx),
std::make_pair("djoincol", djoincol1),
std::make_pair("col_1", d1),
std::make_pair("col_2", d2),
std::make_pair("col_3", d3),
std::make_pair("col_4", i1));
// Right frame: a single row whose join key is 1
std::vector<unsigned long> idx2 = {1 };
std::vector<double> djoincol2 = { 1 };
std::vector<double> d12 = { 11 };
std::vector<double> d22 = { 18 };
std::vector<double> d32 = { 115 };
std::vector<int> i12 = { 122 };
MyDataFrame df2;
df2.load_data(std::move(idx2),
std::make_pair("djoincol", djoincol2),
std::make_pair("xcol_1", d12),
std::make_pair("col_2", d22),
std::make_pair("xcol_3", d32),
std::make_pair("col_4", i12));
std::cout << "First DF:" << std::endl;
df.write<std::ostream, double, int>(std::cout, io_format::csv2);
std::cout << "Second DF2:" << std::endl;
df2.write<std::ostream, double, int>(std::cout, io_format::csv2);
// Template arguments: rhs type, type of the join column, then the column
// types to carry over (double and int)
auto join_df =
df.join_by_column<decltype(df2), double, double, int>(df2, "djoincol", hmdf::join_policy::left_join);
std::cout << "Now The joined DF:" << std::endl;
// unsigned long is listed in addition to the data column types
join_df.write<std::ostream, double, int, unsigned long>(std::cout, io_format::csv2);
}
/*left table:*****/
std::vector
/*right table:*****/
djoincol1 = { 1}
std::vector
I believe the left join result should be
d2 = { 2, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan };
The rhs has only one 2
value and if you repeat them it means you are manufacturing data
you can replicate this in a relational database like Postgres
At line 577, the buggy code is:
if (*(col_vec_lhs[lhs_current].first) == *(col_vec_rhs[rhs_current].first)) joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second, col_vec_rhs[rhs_current].second); else joined_index_idx.emplace_back( std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
rhs_current += 1;
The corrected code is:
if (*(col_vec_lhs[lhs_current].first) == *(col_vec_rhs[rhs_current].first)) joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second, col_vec_rhs[rhs_current].second); else{ joined_index_idx.emplace_back( std::numeric_limits<size_type>::max(),
col_vec_rhs[rhs_current].second);
rhs_current += 1;
}
fix when right value match left value only one value.