hosseinmoein / DataFrame

C++ DataFrame for statistical, Financial, and ML analysis -- in modern C++ using native types and contiguous memory storage
https://hosseinmoein.github.io/DataFrame/
BSD 3-Clause "New" or "Revised" License
2.41k stars 306 forks source link

dataframe_join.tcc miss { } #238

Closed hehuaijin closed 1 year ago

hehuaijin commented 1 year ago

At line 577, the buggy code is: `if (*(col_vec_lhs[lhs_current].first) == *(col_vec_rhs[rhs_current].first)) joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second, col_vec_rhs[rhs_current].second); else joined_index_idx.emplace_back(std::numeric_limits<size_type>::max(), col_vec_rhs[rhs_current].second); rhs_current += 1;`

The corrected code is: `if (*(col_vec_lhs[lhs_current].first) == *(col_vec_rhs[rhs_current].first)) joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second, col_vec_rhs[rhs_current].second); else { joined_index_idx.emplace_back(std::numeric_limits<size_type>::max(), col_vec_rhs[rhs_current].second); rhs_current += 1; }`

This fixes the case where only a single right-hand value matches the left-hand value.

hehuaijin commented 1 year ago

this is full fixed code // Hossein Moein // September 12, 2017 /* Copyright (c) 2019-2026, Hossein Moein All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Hossein Moein BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

include <DataFrame/DataFrame.h>

include

// ----------------------------------------------------------------------------

namespace hmdf {

// Joins this frame with rhs on their index columns.
// Every index value of each side is paired with its row position, the two
// pair lists are ordered by index value, and the helper matching the requested
// policy merges them.  Any policy other than inner/left/right is treated as a
// left-right join.
template<typename I, typename H> template<typename RHS_T, typename ... Ts> DataFrame<I, H> DataFrame<I, H>:: join_by_index (const RHS_T &rhs, join_policy mp) const {

    static_assert(
        std::is_base_of<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>,
                        RHS_T>::value ||
        std::is_base_of<View, RHS_T>::value ||
        std::is_base_of<PtrView, RHS_T>::value,
        "The rhs argument to join_by_index() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>");

    using pair_t = JoinSortingPair<IndexType>;

    const auto          &self_index = get_index();
    const auto          &other_index = rhs.get_index();
    const size_type     self_rows = self_index.size();
    const size_type     other_rows = other_index.size();
    StlVecType<pair_t>  idx_vec_lhs;
    StlVecType<pair_t>  idx_vec_rhs;

    // (address of index value, original row position) for each side
    idx_vec_lhs.reserve(self_rows);
    for (size_type row = 0; row < self_rows; ++row)
        idx_vec_lhs.push_back(std::make_pair(&(self_index[row]), row));
    idx_vec_rhs.reserve(other_rows);
    for (size_type row = 0; row < other_rows; ++row)
        idx_vec_rhs.push_back(std::make_pair(&(other_index[row]), row));

    // Order by the pointed-to index value, not by the pointer itself
    auto    by_value = [] (const pair_t &a, const pair_t &b) -> bool  {
                           return (*(a.first) < *(b.first));
                       };

    std::sort(idx_vec_lhs.begin(), idx_vec_lhs.end(), by_value);
    std::sort(idx_vec_rhs.begin(), idx_vec_rhs.end(), by_value);

    if (mp == join_policy::inner_join)
        return (index_inner_join_
                    <decltype(*this), RHS_T, Ts ...>
                (*this, rhs, idx_vec_lhs, idx_vec_rhs));
    if (mp == join_policy::left_join)
        return (index_left_join_
                    <decltype(*this), RHS_T, Ts ...>
                (*this, rhs, idx_vec_lhs, idx_vec_rhs));
    if (mp == join_policy::right_join)
        return (index_right_join_
                    <decltype(*this), RHS_T, Ts ...>
                (*this, rhs, idx_vec_lhs, idx_vec_rhs));

    // join_policy::left_right_join and anything unrecognised
    return (index_left_right_join_
                <decltype(*this), RHS_T, Ts ...>
            (*this, rhs, idx_vec_lhs, idx_vec_rhs));

}

// ----------------------------------------------------------------------------

// Joins this frame with rhs on the column called name (element type T).
// Every value of that column on each side is paired with its row position,
// the two pair lists are ordered by value, and the helper matching the
// requested policy merges them.  Any policy other than inner/left/right is
// treated as a left-right join.
template<typename I, typename H> template<typename RHS_T, typename T, typename ... Ts> DataFrame<unsigned int, H> DataFrame<I, H>:: join_by_column (const RHS_T &rhs, const char *name, join_policy mp) const {

    static_assert(
        std::is_base_of<
            DataFrame<I,
                      HeteroVector<std::size_t(H::align_value)>>,
                      RHS_T>::value ||
        std::is_base_of<View, RHS_T>::value ||
        std::is_base_of<PtrView, RHS_T>::value,
        "The rhs argument to join_by_column() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>");

    using pair_t = JoinSortingPair<T>;

    const auto          &self_col = get_column<T>(name);
    const auto          &other_col = rhs.template get_column<T>(name);
    const size_type     self_rows = self_col.size();
    const size_type     other_rows = other_col.size();
    StlVecType<pair_t>  col_vec_lhs;
    StlVecType<pair_t>  col_vec_rhs;

    // (address of column value, original row position) for each side
    col_vec_lhs.reserve(self_rows);
    for (size_type row = 0; row < self_rows; ++row)
        col_vec_lhs.push_back(std::make_pair(&(self_col[row]), row));
    col_vec_rhs.reserve(other_rows);
    for (size_type row = 0; row < other_rows; ++row)
        col_vec_rhs.push_back(std::make_pair(&(other_col[row]), row));

    // Order by the pointed-to column value, not by the pointer itself
    auto    by_value = [] (const pair_t &a, const pair_t &b) -> bool  {
                           return (*(a.first) < *(b.first));
                       };

    std::sort(col_vec_lhs.begin(), col_vec_lhs.end(), by_value);
    std::sort(col_vec_rhs.begin(), col_vec_rhs.end(), by_value);

    if (mp == join_policy::inner_join)
        return (column_inner_join_
                    <decltype(*this), RHS_T, T, Ts ...>
                        (*this, rhs, name, col_vec_lhs, col_vec_rhs));
    if (mp == join_policy::left_join)
        return (column_left_join_
                    <decltype(*this), RHS_T, T, Ts ...>
                        (*this, rhs, name, col_vec_lhs, col_vec_rhs));
    if (mp == join_policy::right_join)
        return (column_right_join_
                    <decltype(*this), RHS_T, T, Ts ...>
                        (*this, rhs, name, col_vec_lhs, col_vec_rhs));

    // join_policy::left_right_join and anything unrecognised
    return (column_left_right_join_
                <decltype(*this), RHS_T, T, Ts ...>
                    (*this, rhs, name, col_vec_lhs, col_vec_rhs));

}

// ----------------------------------------------------------------------------

// Transfers the data columns of lhs and rhs into result, driven by the
// (left row, right row) pairs in joined_index_idx.
//
// The definition name is restored to join_helper_common_, which is the name
// every call site in this file uses; the pasted text had lost its underscores.
//
//   lhs, rhs          frames being joined
//   joined_index_idx  row pairing produced by one of the get_*_idx_vector_
//                     functions
//   result            frame receiving the joined columns
//   skip_col_name     when non-null, a column with this name is left out on
//                     both sides (column joins pass the join column here
//                     because they load it themselves)
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename IDX_T, typename ... Ts>
void DataFrame<I, H>::
join_helper_common_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const IndexIdxVector &joined_index_idx,
    DataFrame<IDX_T, HeteroVector<std::size_t(H::align_value)>> &result,
    const char *skip_col_name)  {

    const SpinGuard guard(lock_);

    // Load the common and lhs columns
    for (const auto &iter : lhs.column_list_)  {
        if (skip_col_name && iter.first == skip_col_name)  continue;

        const auto  rhs_citer = rhs.column_tb_.find(iter.first);

        // Common column between two frames
        if (rhs_citer != rhs.column_tb_.end())  {
            index_join_functor_common_<decltype(result), Ts ...> functor(
                iter.first.c_str(),
                rhs,
                joined_index_idx,
                result);

            lhs.data_[iter.second].change(functor);
        }
        else  {  // lhs only column
            // 0 = Left
            index_join_functor_oneside_<0, decltype(result), Ts ...> functor (
                iter.first.c_str(),
                joined_index_idx,
                result);

            lhs.data_[iter.second].change(functor);
        }
    }

    // Load the rhs columns; common ones were already handled above
    for (const auto &iter : rhs.column_list_)  {
        if (skip_col_name && iter.first == skip_col_name)  continue;

        const auto  lhs_citer = lhs.column_tb_.find(iter.first);

        if (lhs_citer == lhs.column_tb_.end())  {  // rhs only column
            // 1 = Right
            index_join_functor_oneside_<1, decltype(result), Ts ...> functor (
                iter.first.c_str(),
                joined_index_idx,
                result);

            rhs.data_[iter.second].change(functor);
        }
    }

}

// ----------------------------------------------------------------------------

// Builds the result frame of an index join from the row pairing in
// joined_index_idx.
//
// The definition name is restored to index_join_helper_, which is the name
// the index_*_join_ wrappers in this file call; the pasted text had lost its
// underscores.
//
// For each pair, the result index takes the lhs index value, unless the left
// position is the std::numeric_limits<size_type>::max() sentinel (no lhs row),
// in which case the rhs index value is used.  The data columns are then
// loaded through join_helper_common_.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
index_join_helper_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const IndexIdxVector &joined_index_idx)  {

    DataFrame<IndexType, HeteroVector<align_value>> result;
    StlVecType<IndexType>                           result_index;

    // Load the index
    result_index.reserve(joined_index_idx.size());
    for (const auto &citer : joined_index_idx)  {
        const size_type left_i = std::get<0>(citer);

        result_index.push_back(
            left_i != std::numeric_limits<size_type>::max()
                ? lhs.indices_[left_i] : rhs.indices_[std::get<1>(citer)]);
    }
    result.load_index(std::move(result_index));

    join_helper_common_<LHS_T, RHS_T, IndexType, Ts ...>
        (lhs, rhs, joined_index_idx, result);
    return(result);

}

// ----------------------------------------------------------------------------

// Builds the result frame of a column join from the row pairing in
// joined_index_idx.
//
// The definition name is restored to column_join_helper_, which is the name
// the column_*_join_ wrappers in this file call; the pasted text had lost its
// underscores.
//
// The result gets:
//   - a fresh index generated by gen_sequence_index(0, number of pairs, 1),
//   - "lhs.<DF_INDEX_COL_NAME>" and "rhs.<DF_INDEX_COL_NAME>" columns holding
//     the original index values of each side (get_nan<> where that side has
//     no row),
//   - the join column col_name, taken from lhs when an lhs row exists and
//     from rhs otherwise,
//   - every other column, loaded through join_helper_common_ with col_name
//     skipped.
// A position equal to std::numeric_limits<size_type>::max() means that side
// has no row for the pair.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_join_helper_(const LHS_T &lhs,
                    const RHS_T &rhs,
                    const char *col_name,
                    const IndexIdxVector &joined_index_idx)  {

    using left_idx_t = typename std::remove_reference<LHS_T>::type::IndexType;
    using right_idx_t = typename std::remove_reference<RHS_T>::type::IndexType;

    const size_type                                     no_row =
        std::numeric_limits<size_type>::max();
    const size_type                                     jii_s =
        joined_index_idx.size();
    DataFrame<unsigned int, HeteroVector<align_value>>  result;

    // Load the new result index
    result.load_index(
        DataFrame<unsigned int, HeteroVector<align_value>>::gen_sequence_index(
            0, static_cast<unsigned int>(jii_s), 1));

    // Load the lhs and rhs indices into two columns in the result
    // Also load the unified named column
    StlVecType<left_idx_t>  lhs_index;
    StlVecType<right_idx_t> rhs_index;
    StlVecType<T>           named_col_vec;
    const ColumnVecType<T>  &lhs_named_col_vec =
        lhs.template get_column<T>(col_name);
    const ColumnVecType<T>  &rhs_named_col_vec =
        rhs.template get_column<T>(col_name);

    lhs_index.reserve(jii_s);
    rhs_index.reserve(jii_s);
    named_col_vec.reserve(jii_s);
    for (const auto &citer : joined_index_idx)  {
        const size_type left_i = std::get<0>(citer);
        const size_type right_i = std::get<1>(citer);

        if (left_i != no_row)  {
            lhs_index.push_back(lhs.indices_[left_i]);
            named_col_vec.push_back(lhs_named_col_vec[left_i]);
        }
        else  {
            // No lhs row: the key value has to come from the rhs side
            named_col_vec.push_back(rhs_named_col_vec[right_i]);
            lhs_index.push_back(get_nan<left_idx_t>());
        }
        if (right_i != no_row)
            rhs_index.push_back(rhs.indices_[right_i]);
        else
            rhs_index.push_back(get_nan<right_idx_t>());
    }

    {
        char            buffer[64];
        const SpinGuard guard(lock_);

        ::snprintf(buffer, sizeof(buffer) - 1, "lhs.%s", DF_INDEX_COL_NAME);
        result.template load_column<left_idx_t>(buffer,
                                                std::move(lhs_index),
                                                nan_policy::pad_with_nans,
                                                false);
        ::snprintf(buffer, sizeof(buffer) - 1, "rhs.%s", DF_INDEX_COL_NAME);
        result.template load_column<right_idx_t>(buffer,
                                                 std::move(rhs_index),
                                                 nan_policy::pad_with_nans,
                                                 false);
        result.template load_column<T>(col_name,
                                       std::move(named_col_vec),
                                       nan_policy::pad_with_nans,
                                       false);
    }

    join_helper_common_<LHS_T, RHS_T, unsigned int, Ts ...>
        (lhs, rhs, joined_index_idx, result, col_name);
    return(result);

}

// ----------------------------------------------------------------------------

template<typename I, typename H> template typename DataFrame<I, H>::IndexIdxVector DataFrame<I, H>::get_inner_index_idxvector( const StlVecType<JoinSortingPair> &col_vec_lhs, const StlVecType<JoinSortingPair> &col_vec_rhs) {

size_type       lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type       rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector  joined_index_idx;

joined_index_idx.reserve(std::min(lhs_end, rhs_end));
while (lhs_current != lhs_end && rhs_current != rhs_end) {
    if (*(col_vec_lhs[lhs_current].first) <
            *(col_vec_rhs[rhs_current].first))
        lhs_current += 1;
    else  {
        if (*(col_vec_lhs[lhs_current].first) ==
                *(col_vec_rhs[rhs_current].first))
            joined_index_idx.emplace_back(
                col_vec_lhs[lhs_current++].second,
                col_vec_rhs[rhs_current].second);
       else  //add this row to fix
        rhs_current += 1;
    }
}
return (joined_index_idx);

}

// ----------------------------------------------------------------------------

// Inner join on the index: computes the inner row pairing of the two sorted
// index pair lists and builds the result frame from it.
// The name and the JoinSortingPair<IndexType> parameter types are restored
// from the call in join_by_index; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
index_inner_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_inner_index_idx_vector_<IndexType>(col_vec_lhs,
                                                        col_vec_rhs)));

}

// ----------------------------------------------------------------------------

// Inner join on column col_name: computes the inner row pairing of the two
// sorted column pair lists and builds the result frame from it.
// The name and the JoinSortingPair<T> parameter types are restored from the
// call in join_by_column; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_inner_join_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const char *col_name,
                   const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                   const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_inner_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));

}

// ----------------------------------------------------------------------------

template<typename I, typename H> template typename DataFrame<I, H>::IndexIdxVector DataFrame<I, H>::get_left_index_idxvector( const StlVecType<JoinSortingPair> &col_vec_lhs, const StlVecType<JoinSortingPair> &col_vec_rhs) {

size_type       lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type       rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector  joined_index_idx;

joined_index_idx.reserve(lhs_end);
while (lhs_current != lhs_end || rhs_current != rhs_end) {
    if (lhs_current >= lhs_end)  break;
    if (rhs_current >= rhs_end)  {
        joined_index_idx.emplace_back(
            col_vec_lhs[lhs_current++].second,
            std::numeric_limits<size_type>::max());
        continue;
    }

    if (*(col_vec_lhs[lhs_current].first) <
            *(col_vec_rhs[rhs_current].first))
        joined_index_idx.emplace_back(
            col_vec_lhs[lhs_current++].second,
            std::numeric_limits<size_type>::max());
    else  {
        if (*(col_vec_lhs[lhs_current].first) ==
                *(col_vec_rhs[rhs_current].first))
            joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second,
                                          col_vec_rhs[rhs_current].second);
       else // add this row fix 
         rhs_current += 1;
    }
}
return (joined_index_idx);

} // ----------------------------------------------------------------------------

// Left join on the index: computes the left row pairing of the two sorted
// index pair lists and builds the result frame from it.
// The name and the JoinSortingPair<IndexType> parameter types are restored
// from the call in join_by_index; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
index_left_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_left_index_idx_vector_<IndexType>(col_vec_lhs,
                                                       col_vec_rhs)));

}

// ----------------------------------------------------------------------------

// Left join on column col_name: computes the left row pairing of the two
// sorted column pair lists and builds the result frame from it.
// The name and the JoinSortingPair<T> parameter types are restored from the
// call in join_by_column; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_left_join_(const LHS_T &lhs,
                  const RHS_T &rhs,
                  const char *col_name,
                  const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                  const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_left_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));

}

// ----------------------------------------------------------------------------

template<typename I, typename H> template typename DataFrame<I, H>::IndexIdxVector DataFrame<I, H>::get_right_index_idxvector( const StlVecType<JoinSortingPair> &col_vec_lhs, const StlVecType<JoinSortingPair> &col_vec_rhs) {

size_type       lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type       rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector  joined_index_idx;

joined_index_idx.reserve(rhs_end);
while (lhs_current != lhs_end || rhs_current != rhs_end) {
    if (rhs_current >= rhs_end)  break;
    if (lhs_current >= lhs_end)  {
        joined_index_idx.emplace_back(
            std::numeric_limits<size_type>::max(),
            col_vec_rhs[rhs_current++].second);
        continue;
    }

    if (*(col_vec_lhs[lhs_current].first) <
            *(col_vec_rhs[rhs_current].first))
        lhs_current += 1;
    else  {
        if (*(col_vec_lhs[lhs_current].first) ==
                *(col_vec_rhs[rhs_current].first))
            joined_index_idx.emplace_back(
                col_vec_lhs[lhs_current++].second,
                col_vec_rhs[rhs_current].second);
        else{
            joined_index_idx.emplace_back(
                std::numeric_limits<size_type>::max(),
                col_vec_rhs[rhs_current].second);
        rhs_current += 1;
        }
    }
}
return (joined_index_idx);

}

// ----------------------------------------------------------------------------

// Right join on the index: computes the right row pairing of the two sorted
// index pair lists and builds the result frame from it.
// The name and the JoinSortingPair<IndexType> parameter types are restored
// from the call in join_by_index; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
index_right_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_right_index_idx_vector_<IndexType>(col_vec_lhs,
                                                        col_vec_rhs)));

}

// ----------------------------------------------------------------------------

// Right join on column col_name: computes the right row pairing of the two
// sorted column pair lists and builds the result frame from it.
// The name and the JoinSortingPair<T> parameter types are restored from the
// call in join_by_column; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_right_join_(const LHS_T &lhs,
                   const RHS_T &rhs,
                   const char *col_name,
                   const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                   const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_right_index_idx_vector_<T>(col_vec_lhs, col_vec_rhs)));

}

// ----------------------------------------------------------------------------

template<typename I, typename H> template typename DataFrame<I, H>::IndexIdxVector DataFrame<I, H>::get_left_right_index_idxvector( const StlVecType<JoinSortingPair> &col_vec_lhs, const StlVecType<JoinSortingPair> &col_vec_rhs) {

size_type       lhs_current = 0;
const size_type lhs_end = col_vec_lhs.size();
size_type       rhs_current = 0;
const size_type rhs_end = col_vec_rhs.size();
IndexIdxVector  joined_index_idx;

joined_index_idx.reserve(std::max(lhs_end, rhs_end));
while (lhs_current != lhs_end || rhs_current != rhs_end) {
    if (lhs_current >= lhs_end && rhs_current < rhs_end)  {
        joined_index_idx.emplace_back(
            std::numeric_limits<size_type>::max(),
            col_vec_rhs[rhs_current++].second);
        continue;
    }
    if (rhs_current >= rhs_end && lhs_current < lhs_end)  {
        joined_index_idx.emplace_back(
            col_vec_lhs[lhs_current++].second,
            std::numeric_limits<size_type>::max());
        continue;
    }

    if (*(col_vec_lhs[lhs_current].first) <
            *(col_vec_rhs[rhs_current].first))  {
        joined_index_idx.emplace_back(
            col_vec_lhs[lhs_current++].second,
            std::numeric_limits<size_type>::max());
    }
    else  {
        if (*(col_vec_lhs[lhs_current].first) ==
                *(col_vec_rhs[rhs_current].first))
            joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second,
                                          col_vec_rhs[rhs_current].second);
        else
        {  //add this row to fix
            joined_index_idx.emplace_back(
                std::numeric_limits<size_type>::max(),
                col_vec_rhs[rhs_current].second);
         rhs_current += 1;
        }  //add this row to fix
    }
}
return (joined_index_idx);

}

// ----------------------------------------------------------------------------

// Left-right join on the index: computes the left-right row pairing of the
// two sorted index pair lists and builds the result frame from it.
// The name and the JoinSortingPair<IndexType> parameter types are restored
// from the call in join_by_index; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
index_left_right_join_(
    const LHS_T &lhs,
    const RHS_T &rhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_lhs,
    const StlVecType<JoinSortingPair<IndexType>> &col_vec_rhs)  {

    return (index_join_helper_<LHS_T, RHS_T, Ts ...>
                (lhs, rhs,
                 get_left_right_index_idx_vector_<IndexType>(col_vec_lhs,
                                                             col_vec_rhs)));

}

// ----------------------------------------------------------------------------

// Left-right join on column col_name: computes the left-right row pairing of
// the two sorted column pair lists and builds the result frame from it.
// The name and the JoinSortingPair<T> parameter types are restored from the
// call in join_by_column; the pasted text had lost them.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename T, typename ... Ts>
DataFrame<unsigned int, HeteroVector<std::size_t(H::align_value)>>
DataFrame<I, H>::
column_left_right_join_(const LHS_T &lhs,
                        const RHS_T &rhs,
                        const char *col_name,
                        const StlVecType<JoinSortingPair<T>> &col_vec_lhs,
                        const StlVecType<JoinSortingPair<T>> &col_vec_rhs)  {

    return (column_join_helper_<LHS_T, RHS_T, T, Ts ...>
                (lhs, rhs, col_name,
                 get_left_right_index_idx_vector_<T>(col_vec_lhs,
                                                     col_vec_rhs)));

}

// ----------------------------------------------------------------------------

// Appends rhs to lhs in place.
//
// The definition name is restored to concat_helper_, which is the name
// self_concat() and concat() in this file call; the pasted text had lost its
// underscores.
//
//   lhs              frame being extended; the rhs index is appended to its
//                    index
//   rhs              frame whose rows are appended
//   add_new_columns  when true, columns present only in rhs are processed as
//                    well
//
// Columns present in both frames are dispatched to concat_functor_ with its
// third argument false, rhs-only columns with it true.  Both receive the lhs
// index size as it was before the append.
template<typename I, typename H>
template<typename LHS_T, typename RHS_T, typename ... Ts>
void DataFrame<I, H>::
concat_helper_(LHS_T &lhs, const RHS_T &rhs, bool add_new_columns)  {

    // Must be captured before the index grows
    const size_type orig_index_s = lhs.get_index().size();

    lhs.get_index().insert(lhs.get_index().end(),
                           rhs.get_index().begin(), rhs.get_index().end());

    // Load common columns
    for (const auto &lhs_iter : lhs.column_list_)  {
        const auto  rhs_citer = rhs.column_tb_.find(lhs_iter.first);

        if (rhs_citer != rhs.column_tb_.end())  {
            concat_functor_<LHS_T, Ts ...>  functor(lhs_iter.first.c_str(),
                                                    lhs,
                                                    false,
                                                    orig_index_s);

            rhs.data_[rhs_citer->second].change(functor);
        }
    }

    // Load columns from rhs that do not exist in lhs
    if (add_new_columns)  {
        for (const auto &rhs_citer : rhs.column_list_)  {
            const auto  lhs_iter = lhs.column_tb_.find(rhs_citer.first);

            if (lhs_iter == lhs.column_tb_.end())  {
                concat_functor_<LHS_T, Ts ...>  functor(
                    rhs_citer.first.c_str(),
                    lhs,
                    true,
                    orig_index_s);

                rhs.data_[rhs_citer.second].change(functor);
            }
        }
    }

}

// ----------------------------------------------------------------------------

// Appends rhs to this frame in place, under this frame's lock, by delegating
// to concat_helper_.  add_new_columns is forwarded unchanged.
// NOTE(review): decltype(*this) is a reference type, so the second half of the
// static_assert looks like it can never fail -- confirm the intent.
template<typename I, typename H> template<typename RHS_T, typename ... Ts> void DataFrame<I, H>::self_concat(const RHS_T &rhs, bool add_new_columns) {

    using std_frame_t =
        DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    constexpr bool  rhs_type_ok =
        std::is_base_of<std_frame_t, RHS_T>::value ||
        std::is_base_of<View, RHS_T>::value ||
        std::is_base_of<PtrView, RHS_T>::value;
    constexpr bool  self_type_ok =
        ! std::is_base_of<std_frame_t, decltype(*this)>::value;

    static_assert(
        rhs_type_ok && self_type_ok,
        "The rhs argument to self_concat() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>. "
        "Self must be StdDataFrame<IndexType>");

    const SpinGuard guard(lock_);

    concat_helper_<decltype(*this), RHS_T, Ts ...>(*this,
                                                   rhs,
                                                   add_new_columns);

}

// ----------------------------------------------------------------------------

// Returns a new frame holding this frame followed by rhs; this frame is not
// modified.  The column set depends on cp:
//   all_columns             copy of this frame, rhs appended, rhs-only
//                           columns included
//   lhs_and_common_columns  copy of this frame, rhs appended, rhs-only
//                           columns ignored
//   common_columns          only the columns present in both frames
// Any other policy value returns an empty frame.
template<typename I, typename H> template<typename RHS_T, typename ... Ts> DataFrame<I, H> DataFrame<I, H>::concat(const RHS_T &rhs, concat_policy cp) const {

    using std_frame_t =
        DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    static_assert(
        (std::is_base_of<std_frame_t, RHS_T>::value ||
         std::is_base_of<View, RHS_T>::value ||
         std::is_base_of<PtrView, RHS_T>::value) &&
        ! std::is_base_of<std_frame_t, decltype(*this)>::value,
        "The rhs argument to concat() can only be "
        "StdDataFrame<IndexType> or DataFrame[Ptr]View<IndexType>. "
        "Self must be StdDataFrame<IndexType>");

    DataFrame<I, HeteroVector<align_value>> result;
    const SpinGuard                         guard(lock_);
    const bool                              take_rhs_only_cols =
        (cp == concat_policy::all_columns);

    if (take_rhs_only_cols ||
        cp == concat_policy::lhs_and_common_columns)  {
        // Start from a full copy of this frame, then append rhs
        result = *this;
        concat_helper_<decltype(result), RHS_T, Ts ...>(
            result, rhs, take_rhs_only_cols);
        return (result);
    }
    if (cp != concat_policy::common_columns)
        return (result);

    // common_columns: seed the result with this frame's index and only those
    // of its columns that rhs also has, then append rhs
    result.load_index(this->get_index().begin(), this->get_index().end());
    for (const auto &own_col : column_list_)  {
        if (rhs.column_tb_.find(own_col.first) == rhs.column_tb_.end())
            continue;

        load_all_functor_<Ts ...>   functor(own_col.first.c_str(), result);

        data_[own_col.second].change(functor);
    }
    concat_helper_<decltype(result), RHS_T, Ts ...>(result, rhs, false);

    return (result);

}

// ----------------------------------------------------------------------------

// Builds a PtrView over this frame followed by rhs.
// The view's index holds the address of every index entry of this frame and
// then of rhs.  Columns are fed to concat_load_view_functor_ according to cp:
//   all_columns             every column of this frame, then every column
//                           of rhs
//   lhs_and_common_columns  every column of this frame, each followed by the
//                           same-named rhs column when rhs has one
//   common_columns          only columns present in both frames, this frame's
//                           first and then the rhs one
// Any other policy value yields a view with an index and no columns.
// NOTE(review): decltype(*this) is a reference type, so the second operand of
// the static_assert looks always true -- confirm the intent.
template<typename I, typename H> template<typename RHS_T, typename ... Ts> typename DataFrame<I, H>::PtrView DataFrame<I, H>::concat_view(RHS_T &rhs, concat_policy cp) {

    static_assert(
        ! std::is_base_of<
              DataFrame<I,
                        HeteroVector<std::size_t(H::align_value)>>,
                        RHS_T>::value ||
        ! std::is_base_of<DataFrame<I,
                                    HeteroVector<std::size_t(H::align_value)>>,
                          decltype(*this)>::value,
        "Currently, arguments to concat_view() can only be "
        "StdDataFrame<IndexType>.");

    using idxvec_t = typename PtrView::IndexVecType;
    using functor_t = concat_load_view_functor_<PtrView, Ts ...>;

    PtrView         result;
    const size_type own_rows = get_index().size();
    const size_type rhs_rows = rhs.get_index().size();
    idxvec_t        view_index;

    view_index.reserve(own_rows + rhs_rows);
    for (size_type row = 0; row < own_rows; ++row)
        view_index.push_back(&(get_index()[row]));
    for (size_type row = 0; row < rhs_rows; ++row)
        view_index.push_back(&(rhs.get_index()[row]));
    result.indices_ = std::move(view_index);

    switch (cp)  {
        case concat_policy::all_columns:
        {
            for (const auto &own_col : column_list_)  {
                functor_t   functor(own_col.first.c_str(), result);

                data_[own_col.second].change(functor);
            }
            for (const auto &rhs_col : rhs.column_list_)  {
                functor_t   functor(rhs_col.first.c_str(), result);

                rhs.data_[rhs_col.second].change(functor);
            }
            break;
        }
        case concat_policy::lhs_and_common_columns:
        {
            for (const auto &own_col : column_list_)  {
                functor_t   functor(own_col.first.c_str(), result);

                data_[own_col.second].change(functor);

                const auto  rhs_pos = rhs.column_tb_.find(own_col.first);

                if (rhs_pos != rhs.column_tb_.end())
                    rhs.data_[rhs_pos->second].change(functor);
            }
            break;
        }
        case concat_policy::common_columns:
        {
            for (const auto &own_col : column_list_)  {
                functor_t   functor(own_col.first.c_str(), result);
                const auto  rhs_pos = rhs.column_tb_.find(own_col.first);

                if (rhs_pos == rhs.column_tb_.end())  continue;

                data_[own_col.second].change(functor);
                rhs.data_[rhs_pos->second].change(functor);
            }
            break;
        }
        default:
            break;
    }

    return (result);

}

// ----------------------------------------------------------------------------

// Const overload: builds a ConstPtrView over this frame followed by rhs.
// The view's index holds the address of every index entry of this frame and
// then of rhs.  Columns are fed to concat_load_view_functor_ according to cp:
//   all_columns             every column of this frame, then every column
//                           of rhs
//   lhs_and_common_columns  every column of this frame, each followed by the
//                           same-named rhs column when rhs has one
//   common_columns          only columns present in both frames, this frame's
//                           first and then the rhs one
// Any other policy value yields a view with an index and no columns.
// NOTE(review): decltype(*this) is a reference type, so the second operand of
// the static_assert looks always true -- confirm the intent.
template<typename I, typename H> template<typename RHS_T, typename ... Ts> typename DataFrame<I, H>::ConstPtrView DataFrame<I, H>::concat_view(RHS_T &rhs, concat_policy cp) const {

    static_assert(
        ! std::is_base_of<
              DataFrame<I,
                        HeteroVector<std::size_t(H::align_value)>>,
                        RHS_T>::value ||
        ! std::is_base_of<DataFrame<I,
                                    HeteroVector<std::size_t(H::align_value)>>,
                          decltype(*this)>::value,
        "Currently, arguments to concat_view() can only be "
        "StdDataFrame<IndexType>.");

    using idxvec_t = typename ConstPtrView::IndexVecType;
    using functor_t = concat_load_view_functor_<ConstPtrView, Ts ...>;

    ConstPtrView    result;
    const size_type own_rows = get_index().size();
    const size_type rhs_rows = rhs.get_index().size();
    idxvec_t        view_index;

    view_index.reserve(own_rows + rhs_rows);
    for (size_type row = 0; row < own_rows; ++row)
        view_index.push_back(&(get_index()[row]));
    for (size_type row = 0; row < rhs_rows; ++row)
        view_index.push_back(&(rhs.get_index()[row]));
    result.indices_ = std::move(view_index);

    switch (cp)  {
        case concat_policy::all_columns:
        {
            for (const auto &own_col : column_list_)  {
                functor_t   functor(own_col.first.c_str(), result);

                data_[own_col.second].change(functor);
            }
            for (const auto &rhs_col : rhs.column_list_)  {
                functor_t   functor(rhs_col.first.c_str(), result);

                rhs.data_[rhs_col.second].change(functor);
            }
            break;
        }
        case concat_policy::lhs_and_common_columns:
        {
            for (const auto &own_col : column_list_)  {
                functor_t   functor(own_col.first.c_str(), result);

                data_[own_col.second].change(functor);

                const auto  rhs_pos = rhs.column_tb_.find(own_col.first);

                if (rhs_pos != rhs.column_tb_.end())
                    rhs.data_[rhs_pos->second].change(functor);
            }
            break;
        }
        case concat_policy::common_columns:
        {
            for (const auto &own_col : column_list_)  {
                functor_t   functor(own_col.first.c_str(), result);
                const auto  rhs_pos = rhs.column_tb_.find(own_col.first);

                if (rhs_pos == rhs.column_tb_.end())  continue;

                data_[own_col.second].change(functor);
                rhs.data_[rhs_pos->second].change(functor);
            }
            break;
        }
        default:
            break;
    }

    return (result);

}

} // namespace hmdf

// ----------------------------------------------------------------------------

// Local Variables: // mode:C++ // tab-width:4 // c-basic-offset:4 // End:

hosseinmoein commented 1 year ago

Thank you for looking into this. Why don't you submit this as a PR (pull request)? That way it goes through all the tests and you get the credit for it, if you care about that.

hehuaijin commented 1 year ago

您好,您的邮件已收到,我会尽快查看并回复您。谢谢!

hehuaijin commented 1 year ago

Thank you for the reply! I will try to learn how to submit this as a PR!

阿黑 @.***

 

------------------ 原始邮件 ------------------ 发件人: "Hossein @.>; 发送时间: 2023年4月27日(星期四) 晚上9:30 收件人: @.>; 抄送: @.>; @.>; 主题: Re: [hosseinmoein/DataFrame] dataframe_join.tcc miss { } (Issue #238)

Thank you for looking into this. Why don't you submit this as a PR (pull request)? That way it goes through all the tests and you get the credit for it, if you care about that.

— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you authored the thread.Message ID: @.***>

hosseinmoein commented 1 year ago

I looked at this further. I don't think this is a bug. It should work properly. But I could be wrong. Can you show me, through an example, that this is a bug?

Thanks

hehuaijin commented 1 year ago

I'm sorry; my apologies for the late reply.

This is a sample:

/***/

include <DataFrame/DataFrame.h> // Main DataFrame header

include <DataFrame/DataFrameFinancialVisitors.h> // Financial algorithms

include <DataFrame/DataFrameMLVisitors.h> // Machine-learning algorithms

include <DataFrame/DataFrameStatsVisitors.h> // Statistical algorithms

include <DataFrame/Utils/DateTime.h> // Cool and handy date-time object

using namespace hmdf;

// A DataFrame with ulong index type // using ULDataFrame = StdDataFrame;

// A DataFrame with string index type // using StrDataFrame = StdDataFrame;

// A DataFrame with DateTime index type // using DTDataFrame = StdDataFrame;

// Reproduction sample: left-joins a 14-row frame with a 1-row frame on the
// "djoincol" column and prints both inputs followed by the joined result.
void test_index_left_join() {

    using MyDataFrame = ULDataFrame;
    std::cout << "\nTesting Index Left Join ..." << std::endl;

    // Left-hand frame. The join key is 1 for the first 13 rows and 14 for
    // the last row.
    std::vector<unsigned long>  idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457,
          123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double> djoincol1 =
        { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 14 };
    std::vector<double> d1 =
        { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double> d2 =
        { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    // d3 (13 values) and i1 (5 values) are shorter than the 14-entry index.
    std::vector<double> d3 =
        { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>    i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame         df;

    df.load_data(std::move(idx),
                 std::make_pair("djoincol", djoincol1),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    // Right-hand frame: a single row whose join key (1) matches the first
    // 13 rows of the left-hand frame.
    std::vector<unsigned long>  idx2 = { 1 };
    std::vector<double> djoincol2 = { 1 };
    std::vector<double> d12 = { 11 };
    std::vector<double> d22 = { 18 };
    std::vector<double> d32 = { 115 };
    std::vector<int>    i12 = { 122 };
    MyDataFrame         df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("djoincol", djoincol2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    std::cout << "First DF:" << std::endl;
    df.write<std::ostream, double, int>(std::cout);
    std::cout << "Second DF2:" << std::endl;
    df2.write<std::ostream, double, int>(std::cout);

    // Template arguments: the right-hand frame type, then the type of the
    // join column (double), then the column types to carry into the result
    // (double, int). Passing only <..., double, int> treats double as the
    // join-column type and carries over int columns alone.
    auto join_df =
        df.join_by_column<decltype(df2), double, double, int>(
            df2, "djoincol", hmdf::join_policy::left_join);

    std::cout << "Now The joined DF:" << std::endl;
    // unsigned long is listed so that columns holding the original index
    // values are printed too -- TODO confirm against the join_by_column docs.
    join_df.write<std::ostream, double, int, unsigned long>(std::cout);

}

write out:

Testing Index Left Join ... First DF: INDEX:14::123450,123451,123452,123453,123454,123455,123456,123457,123458,123459,123460,123461,123462,123466, djoincol:14::1,1,1,1,1,1,1,1,1,1,1,1,1,14, col_1:14::1,2,3,4,5,6,7,8,9,10,11,12,13,14, col_2:14::8,9,10,11,12,13,14,20,22,23,30,31,32,1.89, col_3:14::15,16,15,18,19,16,21,0.34,1.56,0.34,2.3,0.34,19,nan, col_4:14::22,23,24,25,99,0,0,0,0,0,0,0,0,0,

Second DF2: INDEX:1::1, djoincol:1::1, xcol_1:1::11, col_2:1::18, xcol_3:1::115, col_4:1::122,

Now The joined DF: INDEX:14::0,1,2,3,4,5,6,7,8,9,10,11,12,13, djoincol:14::1,1,1,1,1,1,1,1,1,1,1,1,1,14, lhs.col_4:14::22,23,24,25,99,0,0,0,0,0,0,0,0,0, rhs.col_4:14::122,0,0,0,0,0,0,0,0,0,0,0,0,0,

Should be: djoincol:14::1,1,1,1,1,1,1,1,1,1,1,1,1,14, lhs.col_4:14::22,23,24,25,99,0,0,0,0,0,0,0,0,0, rhs.col_4:14::122,122,122,122,122,122,122,122,122,122,122,122,122,nan,

hosseinmoein commented 1 year ago

I am a bit confused. The code that you are saying is wrong and that you corrected (line 577 in file DataFrame_join.tcc) is for the left-right join (aka merge). The code example above is for a left outer join. In other words, the example above doesn't execute the code you corrected.

hehuaijin commented 1 year ago

There is more than one similar bug. I posted the fixed code, with each change marked by the comment "//add this row to fix".

hosseinmoein commented 1 year ago

Can you please submit a pull request?

I am still confused. Your "//add this row to fix" comments appear in only two functions:

get_inner_index_idx_vector_()
get_left_right_index_idx_vector_()

Neither of these functions is executed in the code sample you posted in test_index_left_join()

hehuaijin commented 1 year ago

get_left_right_index_idx_vector_():

while (lhs_current != lhs_end || rhs_current != rhs_end) { ....... if ((col_vec_lhs[lhs_current].first) < (col_vec_rhs[rhs_current].first)) { joined_index_idx.emplace_back( col_vec_lhs[lhs_current++].second, std::numeric_limits::max()); } else { if ((col_vec_lhs[lhs_current].first) == (col_vec_rhs[rhs_current].first)) joined_index_idx.emplace_back(col_vec_lhs[lhs_current++].second, // **here ,left record move to next**** col_vec_rhs[rhs_current].second); else { //add this row to fix
joined_index_idx.emplace_back( std::numeric_limits::max(), col_vec_rhs[rhs_current].second);

//** if no “{” "}", next line will run every time. if next left record eq rhs_current,and rhs_current will move to next. *****

         rhs_current += 1;    
        }  //add this row to fix
    }
}
hosseinmoein commented 1 year ago

Thank you for looking into this. But I believe the original behavior/code is correct. Making your changes will introduce the bug of repeating the RHS values where they shouldn't be.

Also your code sample above is missing some type specifications. The correct code is:

static void test_index_left_join()  {

    using MyDataFrame = ULDataFrame;
    std::cout << "\nTesting Index Left Join ..." << std::endl;

    std::vector<unsigned long>   idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double>          djoincol1 = { 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 14 };
    std::vector<double>          d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double>          d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    std::vector<double>          d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>                  i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame                     df;

    df.load_data(std::move(idx),
                 std::make_pair("djoincol", djoincol1),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    std::vector<unsigned long>   idx2 = {1 };
    std::vector<double>               djoincol2 = { 1 };
    std::vector<double>               d12 = { 11 };
    std::vector<double>               d22 = { 18 };
    std::vector<double>               d32 = { 115 };
    std::vector<int>                       i12 = { 122 };
    MyDataFrame                          df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("djoincol", djoincol2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    std::cout << "First DF:" << std::endl;
    df.write<std::ostream, double, int>(std::cout, io_format::csv2);
    std::cout << "Second DF2:" << std::endl;
    df2.write<std::ostream, double, int>(std::cout, io_format::csv2);

    auto join_df =
        df.join_by_column<decltype(df2), double, double, int>(df2, "djoincol", hmdf::join_policy::left_join);

    std::cout << "Now The joined DF:" << std::endl;
    join_df.write<std::ostream, double, int, unsigned long>(std::cout, io_format::csv2);
}
hehuaijin commented 1 year ago

/*left table:*****/ std::vector djoincol1 = { 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 14 }; std::vector d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };

/*right table:*****/ djoincol1 = { 1} std::vector d2 = {2} /*right table:*****/ join on left.djoincol1 =right.djoincol1 I think result should be: d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; d2 = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, nan };

hosseinmoein commented 1 year ago

I believe the left join result should be

d2 = { 2, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan };

The RHS has only one value of 2, and if you repeat it, you are manufacturing data. You can replicate this in a relational database like Postgres.