ewlu / gcc-precommit-ci


Patch Status 35984-v2_load_and_storelanes_with_SLP-1 #1857

Closed github-actions[bot] closed 2 months ago

github-actions[bot] commented 4 months ago

Precommit CI Run information

Logs can be found in the associated Github Actions run: https://github.com/ewlu/gcc-precommit-ci/actions/runs/9839079371

Patch information

Applied patches: 1 -> 1 Associated series: https://patchwork.sourceware.org/project/gcc/list/?series=35984 Last patch applied: https://patchwork.sourceware.org/project/gcc/patch/20240708113356.A60DF3861C69@sourceware.org/ Patch id: 93496

Build Targets

Some targets are built as multilibs. If a build target ends with multilib, please refer to the table below to see all the targets within that multilib.

| Target name | -march string |
| --- | --- |
| newlib-rv64gc-lp64d-multilib | rv32gc-ilp32d, rv64gc-lp64d |
| newlib-rv64gcv-lp64d-multilib | rv64gcv-lp64d |
| linux-rv64gcv-lp64d-multilib | rv32gcv-ilp32d, rv64gcv-lp64d |

Target Information

| Target Shorthand | -march string |
| --- | --- |
| Bitmanip | gc_zba_zbb_zbc_zbs |

Notes

Testsuite results use a more lenient allowlist to reduce error reporting from flaky tests. Please take a look at the current allowlist. Results come from a sum-file comparator. Each patch is applied to a well-known, non-broken baseline taken from our gcc postcommit framework (here), which runs the full gcc testsuite every 6 hours. If you have any questions or encounter any issues which may seem like false positives, please contact us at patchworks-ci@rivosinc.com
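The sum-file comparison above can be pictured with a short sketch. This is a hypothetical simplification, not the actual Rivos comparator: it parses DejaGnu `.sum` lines such as `PASS: foo.c` / `FAIL: foo.c` and reports tests that regressed from PASS to FAIL relative to the baseline, skipping allowlisted (known-flaky) tests.

```python
# Hypothetical sketch of a DejaGnu .sum comparator; the real CI tooling differs.
def parse_sum(text):
    """Map each test name to its result (PASS, FAIL, XPASS, ...)."""
    results = {}
    for line in text.splitlines():
        for status in ("PASS", "FAIL", "XPASS", "XFAIL", "UNRESOLVED"):
            prefix = status + ": "
            if line.startswith(prefix):
                results[line[len(prefix):]] = status
                break
    return results

def regressions(baseline_text, patched_text, allowlist=()):
    """Tests that went PASS -> FAIL, minus allowlisted flaky tests."""
    base = parse_sum(baseline_text)
    new = parse_sum(patched_text)
    return sorted(t for t, s in new.items()
                  if s == "FAIL" and base.get(t) == "PASS"
                  and t not in allowlist)

baseline = "PASS: gcc.dg/a.c\nPASS: gcc.dg/b.c\n"
patched = "PASS: gcc.dg/a.c\nFAIL: gcc.dg/b.c\n"
print(regressions(baseline, patched))  # -> ['gcc.dg/b.c']
```

Comparing against a pinned, known-good baseline (rather than tip of tree) is what lets the CI attribute a new FAIL to the patch rather than to unrelated churn.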

github-actions[bot] commented 4 months ago

Lint Status

The following issues have been found with 35984-v2_load_and_storelanes_with_SLP-1 using gcc's ./contrib/check_GNU_style.py. Please use your best judgement when resolving these issues. These are only warnings and do not need to be resolved in order to merge your patch.

=== ERROR type #1: dot, space, space, end of comment (1 error(s)) ===
gcc/tree-vect-slp.cc:3617:38:                        stmt_info, group_size, 1 /*i*/);

=== ERROR type #2: lines should not exceed 80 characters (5 error(s)) ===
gcc/tree-vect-slp.cc:9858:80:                  new_stmt = gimple_build_assign (vec, build1 (VIEW_CONVERT_EXPR, vectype, gimple_assign_lhs (new_stmt)));
gcc/tree-vect-stmts.cc:2077:80:                            ? vect_load_lanes_supported (vectype, group_size, masked_p)
gcc/tree-vect-stmts.cc:2079:80:                                                          masked_p))) != IFN_LAST)
gcc/tree-vect-stmts.cc:8814:80:                        vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
gcc/tree-vect-stmts.cc:8858:80:              vec_array = TREE_OPERAND (gimple_assign_lhs (SSA_NAME_DEF_STMT (def)), 0);
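The 80-column warnings above come from `./contrib/check_GNU_style.py`, which scans only the lines a patch adds. The heart of that particular check can be sketched as follows (a simplified stand-in, not the actual script):

```python
# Simplified sketch of the "lines should not exceed 80 characters" lint;
# gcc's real ./contrib/check_GNU_style.py performs many more checks.
def long_lines(patch_text, limit=80):
    """Report added lines in a unified diff longer than LIMIT columns."""
    errors = []
    for lineno, line in enumerate(patch_text.splitlines(), start=1):
        # Only added lines ('+' prefix, but not the '+++' file header) count.
        if line.startswith("+") and not line.startswith("+++"):
            content = line[1:].expandtabs(8)
            if len(content) > limit:
                errors.append((lineno, len(content)))
    return errors

patch = "+++ b/x.c\n+short line\n+" + "x" * 85 + "\n"
print(long_lines(patch))  # -> [(3, 85)]
```

Note that tabs are expanded before measuring, so a line that looks short in an editor with narrow tab stops can still trip the 80-column limit.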


github-actions[bot] commented 4 months ago

Apply Status

| Target | Status |
| --- | --- |
| Baseline hash: https://github.com/gcc-mirror/gcc/commit/53eef7915ac1a9b9ad672cc2e80509c15a764ca2 | Failed |
| Tip of tree hash: https://github.com/gcc-mirror/gcc/commit/5be97039aa6c27fdf5d5bd43ef393b307c5ecedd | Failed |

Command

> git am ../patches/*.patch --whitespace=fix -q --3way --empty=drop

Output

error: sha1 information is lacking or useless (gcc/tree-vect-slp.cc).
error: could not build fake ancestor
hint: Use 'git am --show-current-patch=diff' to see the failed patch
hint: When you have resolved this problem, run "git am --continue".
hint: If you prefer to skip this patch, run "git am --skip" instead.
hint: To restore the original branch and stop patching, run "git am --abort".
hint: Disable this message with "git config advice.mergeConflict false"
Patch failed at 0001 load and store-lanes with SLP
---
 gcc/tree-vect-slp.cc   | 147 +++++++++++++++++++++++++++++++++++++++--
 gcc/tree-vect-stmts.cc |  68 ++++++++++++++-----
 gcc/tree-vectorizer.h  |   3 +
 3 files changed, 197 insertions(+), 21 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 2dc6d365303..a5648cacd70 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -120,6 +120,7 @@ _slp_tree::_slp_tree ()
   SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
   SLP_TREE_CODE (this) = ERROR_MARK;
+  this->ldst_lanes = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   SLP_TREE_REF_COUNT (this) = 1;
@@ -3601,9 +3602,22 @@ vect_build_slp_instance (vec_info *vinfo,
     size >= 1.  */
       else if (is_a <loop_vec_info> (vinfo)
           && (i > 0 && i < group_size)
-          && !vect_slp_prefer_store_lanes_p (vinfo,
-                         stmt_info, group_size, i))
-   {
+          /*&& !vect_slp_prefer_store_lanes_p (vinfo,
+                         stmt_info, group_size, i)*/)
+   {
+     /* There are targets that cannot do even/odd interleaving schemes
+        so they absolutely need to use load/store-lanes.  For now
+        force single-lane SLP for them - they would be happy with
+        uniform power-of-two lanes (but depending on element size),
+        but even if we can use 'i' as indicator we would need to
+        backtrack when later lanes fail to discover with the same
+        granularity.  */
+     bool want_store_lanes
+       = vect_slp_prefer_store_lanes_p (vinfo,
+                        stmt_info, group_size, 1 /*i*/);
+     if (want_store_lanes)
+       i = 1;
+
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                 "Splitting SLP group at stmt %u\n", i);
@@ -3637,7 +3651,10 @@ vect_build_slp_instance (vec_info *vinfo,
                           (max_nunits, end - start));
          rhs_nodes.safe_push (node);
          start = end;
-         end = group_size;
+         if (want_store_lanes)
+           end = start + 1;
+         else
+           end = group_size;
        }
          else
        {
@@ -3711,7 +3728,14 @@ vect_build_slp_instance (vec_info *vinfo,
         we end up with two inputs.  That scheme makes sure we end
         up with permutes satisfying the restriction of requiring at
         most two vector inputs to produce a single vector output
-        when the number of lanes is even.  */
+        when the number of lanes is even.
+        Leave the merge as is when we want to use store-lanes.  */
+         if (want_store_lanes)
+       {
+         perm->ldst_lanes = 1;
+         node->ldst_lanes = 1;
+       }
+         else
          while (SLP_TREE_CHILDREN (perm).length () > 2)
        {
          /* When we have three equal sized groups left the pairwise
@@ -4057,6 +4081,36 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
   if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
     return;

+  /* Verify if all load permutations can be implemented with a suitably
+     large element load-lanes operation.  */
+  unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
+  if (exact_log2 (ld_lanes_lanes) == -1
+      /* ???  Need to handle ld_lanes_lanes != 1 correctly.  */
+      || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
+                    group_lanes, false) == IFN_LAST)
+    ld_lanes_lanes = 0;
+  else
+    for (slp_tree load : loads)
+      {
+   if (SLP_TREE_LANES (load) != ld_lanes_lanes)
+     {
+       ld_lanes_lanes = 0;
+       break;
+     }
+   unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
+   if (first % ld_lanes_lanes != 0)
+     {
+       ld_lanes_lanes = 0;
+       break;
+     }
+   for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
+     if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
+       {
+         ld_lanes_lanes = 0;
+         break;
+       }
+      }
+
   for (slp_tree load : loads)
     {
       /* Leave masked or gather loads alone for now.  */
@@ -4071,7 +4125,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
     with a non-1:1 load permutation around instead of canonicalizing
     those into a load and a permute node.  Removing this early
     check would do such canonicalization.  */
-      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
+     && ld_lanes_lanes == 0)
    continue;

       /* First build (and possibly re-use) a load node for the
@@ -4104,6 +4159,12 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
    final_perm.quick_push
      (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));

+      if (ld_lanes_lanes != 0)
+   {
+     l0->ldst_lanes = true;
+     load->ldst_lanes = true;
+   }
+      else
       while (1)
    {
      unsigned group_lanes = SLP_TREE_LANES (l0);
@@ -9705,6 +9766,8 @@ vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   node->push_vec_def (perm_stmt);
 }

+extern tree create_vector_array (tree elem_type, unsigned HOST_WIDE_INT nelems);
+
 /* Subroutine of vectorizable_slp_permutation.  Check whether the target
    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
    If GSI is nonnull, emit the permutation there.
@@ -9758,6 +9821,78 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,

   gcc_assert (perm.length () == SLP_TREE_LANES (node));

+  /* Load/store-lanes permute.  */
+  if (node->ldst_lanes)
+    {
+      if (!gsi)
+   /* ???  We should have this flag set on the permute before a store
+      or the permutes after a load.  Check vect_load_lanes_supported
+      or leave that to the store?  */
+   return 1;
+      /* For the load case there should be a single child and
+    it's vector def should be the virtual SSA def of the
+    .LOAD_LANES call.  Create the vect__5.7_48 = vect_array.6[n]
+    load here plus punning of the vector type (in case an element
+    represents multiple lanes).  */
+      if (children.length () == 1)
+   {
+     slp_tree child = children[0];
+     unsigned vec_num = (SLP_TREE_LANE_PERMUTATION (node)[0].second
+                 / SLP_TREE_LANES (node));
+     for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
+       {
+         tree def = SLP_TREE_VEC_DEFS (child)[i];
+         gcc_assert (virtual_operand_p (def));
+         tree decl = gimple_call_lhs (SSA_NAME_DEF_STMT (def));
+         tree vec = make_ssa_name (TREE_TYPE (TREE_TYPE (decl)));
+         gassign *new_stmt
+       = gimple_build_assign (vec,
+                      build4 (ARRAY_REF, TREE_TYPE (vec),
+                          decl, build_int_cstu
+                        (unsigned_type_node, vec_num),
+                          NULL_TREE, NULL_TREE));
+         vect_finish_stmt_generation (vinfo, NULL, new_stmt, gsi);
+         if (!useless_type_conversion_p (vectype, TREE_TYPE (vec)))
+       {
+         vec = make_ssa_name (vectype);
+         new_stmt = gimple_build_assign (vec, build1 (VIEW_CONVERT_EXPR, vectype, gimple_assign_lhs (new_stmt)));
+         vect_finish_stmt_generation (vinfo, NULL, new_stmt, gsi);
+       }
+         node->push_vec_def (vec);
+       }
+     return SLP_TREE_VEC_DEFS (node).length ();
+   }
+      /* For the store case create the array and the set of stores from
+    all children vector defs.  The vector def should then be the
+    virtual def of the last store into the array.  */
+      else
+   {
+     unsigned vec_num = SLP_TREE_VEC_DEFS (children[0]).length ();
+     for (unsigned i = 0; i < vec_num; ++i)
+       {
+         tree decl = create_vector_array (op_vectype,
+                          SLP_TREE_LANES (node)
+                          / SLP_TREE_LANES (children[0]));
+         tree last_vdef = NULL_TREE;
+         for (unsigned j = 0; j < children.length (); ++j)
+       {
+         slp_tree child = children[j];
+         tree ref = build4 (ARRAY_REF, op_vectype, decl,
+                    build_int_cstu (unsigned_type_node, j),
+                    NULL_TREE, NULL_TREE);
+         gimple *new_stmt
+           = gimple_build_assign (ref, SLP_TREE_VEC_DEFS (child)[i]);
+         vect_finish_stmt_generation (vinfo, NULL, new_stmt, gsi);
+         last_vdef = make_ssa_name (gimple_vop (cfun));
+         SSA_NAME_DEF_STMT (last_vdef) = new_stmt;
+         gimple_set_vdef (new_stmt, last_vdef);
+       }
+         node->push_vec_def (last_vdef);
+       }
+     return vec_num;
+   }
+    }
+
   /* REPEATING_P is true if every output vector is guaranteed to use the
      same permute vector.  We can handle that case for both variable-length
      and constant-length vectors, but we only handle other cases for
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index fdcda0d2aba..7169ad4f9c9 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -146,7 +146,7 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,

 /* Return a variable of type ELEM_TYPE[NELEMS].  */

-static tree
+tree
 create_vector_array (tree elem_type, unsigned HOST_WIDE_INT nelems)
 {
   return create_tmp_var (build_array_type_nelts (elem_type, nelems),
@@ -1508,7 +1508,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,

   unsigned int nvectors;
   if (slp_node)
-    nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+    /* ???  Incorrect for multi-lane lanes.  */
+    nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
   else
     nvectors = vect_get_num_copies (loop_vinfo, vectype);

@@ -2069,6 +2070,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
         is irrelevant for them.  */
          *alignment_support_scheme = dr_unaligned_supported;
        }
+     /* Try using LOAD/STORE_LANES.  */
+     else if (slp_node->ldst_lanes
+          && (*lanes_ifn
+            = (vls_type == VLS_LOAD
+               ? vect_load_lanes_supported (vectype, group_size, masked_p)
+               : vect_store_lanes_supported (vectype, group_size,
+                             masked_p))) != IFN_LAST)
+       *memory_access_type = VMAT_LOAD_STORE_LANES;
      else
        *memory_access_type = VMAT_CONTIGUOUS;

@@ -8762,7 +8771,7 @@ vectorizable_store (vec_info *vinfo,

   if (memory_access_type == VMAT_LOAD_STORE_LANES)
     {
-      gcc_assert (!slp && grouped_store);
+      //gcc_assert (!slp && grouped_store);
       unsigned inside_cost = 0, prologue_cost = 0;
       /* For costing some adjacent vector stores, we'd like to cost with
     the total number of them once instead of cost each one by one. */
@@ -8784,7 +8793,7 @@ vectorizable_store (vec_info *vinfo,
          op = vect_get_store_rhs (next_stmt_info);
          if (costing_p)
            update_prologue_cost (&prologue_cost, op);
-         else
+         else if (!slp)
            {
              vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
                             ncopies, op,
@@ -8799,15 +8808,15 @@ vectorizable_store (vec_info *vinfo,
        {
          if (mask)
            {
-             vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
-                            mask, &vec_masks,
-                            mask_vectype);
+             if (slp_node)
+           vect_get_slp_defs (mask_node, &vec_masks);
+             else
+           vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
+                              mask, &vec_masks,
+                              mask_vectype);
              vec_mask = vec_masks[0];
            }

-         /* We should have catched mismatched types earlier.  */
-         gcc_assert (
-           useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
          dataref_ptr
            = vect_create_data_ref_ptr (vinfo, first_stmt_info,
                        aggr_type, NULL, offset, &dummy,
@@ -8819,11 +8828,17 @@ vectorizable_store (vec_info *vinfo,
          gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
          /* DR_CHAIN is then used as an input to
         vect_permute_store_chain().  */
+         if (!slp)
+       {
+         /* We should have catched mismatched types earlier.  */
+         gcc_assert (
+           useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
          for (i = 0; i < group_size; i++)
        {
          vec_oprnd = (*gvec_oprnds[i])[j];
          dr_chain[i] = vec_oprnd;
        }
+       }
          if (mask)
        vec_mask = vec_masks[j];
          dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
@@ -8836,8 +8851,16 @@ vectorizable_store (vec_info *vinfo,
          continue;
        }

+     tree vec_array;
+     if (slp)
+       {
+         tree def = SLP_TREE_VEC_DEFS (SLP_TREE_CHILDREN (slp_node)[0])[j];
+         vec_array = TREE_OPERAND (gimple_assign_lhs (SSA_NAME_DEF_STMT (def)), 0);
+       }
+     else
+       {
      /* Get an array into which we can store the individual vectors.  */
-     tree vec_array = create_vector_array (vectype, vec_num);
+     vec_array = create_vector_array (vectype, vec_num);

      /* Invalidate the current contents of VEC_ARRAY.  This should
         become an RTL clobber too, which prevents the vector registers
@@ -8851,6 +8874,7 @@ vectorizable_store (vec_info *vinfo,
          write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
                  i);
        }
+       }

      tree final_mask = NULL;
      tree final_len = NULL;
@@ -8917,9 +8941,10 @@ vectorizable_store (vec_info *vinfo,

      /* Record that VEC_ARRAY is now dead.  */
      vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
-     if (j == 0)
+     if (j == 0 && !slp)
        *vec_stmt = new_stmt;
-     STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+     if (!slp)
+       STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
    }

       if (costing_p)
@@ -10765,12 +10790,14 @@ vectorizable_load (vec_info *vinfo,
     {
       gcc_assert (alignment_support_scheme == dr_aligned
          || alignment_support_scheme == dr_unaligned_supported);
-      gcc_assert (grouped_load && !slp);
+      //gcc_assert (grouped_load && !slp);

       unsigned int inside_cost = 0, prologue_cost = 0;
       /* For costing some adjacent vector loads, we'd like to cost with
     the total number of them once instead of cost each one by one. */
       unsigned int n_adjacent_loads = 0;
+      if (slp_node)
+   ncopies = slp_node->vec_stmts_size / vec_num;
       for (j = 0; j < ncopies; j++)
    {
      if (costing_p)
@@ -10884,6 +10911,15 @@ vectorizable_load (vec_info *vinfo,
      gimple_call_set_nothrow (call, true);
      vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);

+     if (slp)
+       {
+         tree def = copy_ssa_name (gimple_vuse (call));
+         gimple_set_vdef (call, def);
+         SSA_NAME_DEF_STMT (def) = call;
+         slp_node->push_vec_def (def);
+       }
+     else
+       {
      dr_chain.create (vec_num);
      /* Extract each vector into an SSA_NAME.  */
      for (i = 0; i < vec_num; i++)
@@ -10900,8 +10936,10 @@ vectorizable_load (vec_info *vinfo,
      vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);

      dr_chain.release ();
+       }

-     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+     if (!slp_node)
+       *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
    }

       if (costing_p)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 8eb3ec4df86..ac288541c51 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -222,6 +222,9 @@ struct _slp_tree {
   unsigned int lanes;
   /* The operation of this node.  */
   enum tree_code code;
+  /* Whether uses of this load or feeders of this store are suitable
+     for load/store-lanes.  */
+  bool ldst_lanes;

   int vertex;
