RISCV optimization on checking certain bits set ((x & mask) == val)

It might be common in the C family of languages to check if certain bits are set in an integer with a code pattern like this:

unsigned int x;
if ((x & 0x3000) == 0x1000) {
  // Do something...
}

And I am surprised when compilers like GCC and Clang didn't realize they can use some bit shifts and inversions of bit masks to save some instructions and emit smaller code.

Here I present 3 possible optimizations that could be implemented in a compiler. Two of them can apply not only to RISC-V, but other RISC architectures as well (except ARM, perhaps). The last one is specific to RISC-V due to the 20-bit immediate operand of its "lui" (load upper immediate) instruction.

The bit masks should be compile time constants, and the "-Os" flag (optimize for size) is assumed.

Test code

The example code and constants are crafted specifically for RISC-V.

Each group of pred* functions should function identically (if not, please let me know; it might be a typo).

The "a" variants are what I commonly write for checking the set bits.
The "b" variants are what I believe the compiler should ideally transform the code to. I wrote them to let compiler developers know how the optimization can be done. (But in practice the "b" code might transform to "a", meaning the "optimization" direction reversed.)
The "c" variants are hacks to make things work. They contain __asm__ volatile directives to force GCC or Clang to optimize in the direction I want. The generated assembly should present what I considered the ideal result.

#include <stdbool.h>
#include <stdint.h>
#define POWER_OF_TWO_FACTOR(x) ((x) & -(x))

// -------------------------------------------------------------------
// Example 1: The bitwise AND mask contains lower bits in all ones.
// By converting the bitwise AND into a bitwise OR, an "addi"
// instruction can be saved.
// (This might conflict with optimizations utilizing RISC-V "bclri"
// instruction; use one or the other.)
// (In ARM there are "bic" instructions already, making this
// optimization useless.)

static uint32_t mask1 = 0x55555FFF;
static uint32_t val1  = 0x14501DEF;
// static_assert((mask1 & val1) == val1);
// static_assert((mask1 & 0xFFF) == 0xFFF);

bool pred1a(uint32_t x) {
  return ((x & mask1) == val1);
}

bool pred1b(uint32_t x) {
  return ((x | ~mask1) == (val1 | ~mask1));
}

bool pred1c(uint32_t x) {
  register uint32_t temp = x | ~mask1;
  __asm__ volatile ("" : "+r" (temp));
  return (temp == (val1 | ~mask1));
}

// -------------------------------------------------------------------
// Example 2: The bitwise AND mask could fit an 11-bit immediate
// operand of RISC-V "andi" instruction with a help of right
// shifting. (Keep the sign bit of the immediate operand zero.)
// (This kind of optimization could also work with other RISC 
// architectures, except ARM.)

static uint32_t mask2 = 0x55500000;
static uint32_t val2  = 0x14500000;

// static_assert(mask2 != 0);
// static_assert((mask2 & val2) == val2);
// static_assert(mask2 / POWER_OF_TWO_FACTOR(mask2) <= 0x7FF);

bool pred2a(uint32_t x) {
  return ((x & mask2) == val2);
}

bool pred2b(uint32_t x) {
  uint32_t factor = POWER_OF_TWO_FACTOR(mask2);
  return ((x >> __builtin_ctz(factor)) & (mask2 / factor))
    == (val2 / factor);
}

bool pred2c(uint32_t x) {
  uint32_t factor = POWER_OF_TWO_FACTOR(mask2);
  register uint32_t temp = x >> 20;
  __asm__ volatile ("" : "+r" (temp));
  return (temp & 0x555) == 0x145;
}

// -------------------------------------------------------------------
// Example 3: The bitwise AND mask could fit a 20-bit immediate
// operand of RISC-V "lui" instruction.
// Only RISC-V has this 20-bit immediate "U-type" format, AFAIK.

static uint32_t mask3 = 0x00055555;
static uint32_t val3  = 0x00045014;

// static_assert(mask3 / POWER_OF_TWO_FACTOR(mask3) <= 0xFFFFF);

bool pred3a(uint32_t x) {
  return ((x & mask3) == val3);
}

bool pred3b(uint32_t x) {
  uint32_t factor = POWER_OF_TWO_FACTOR(mask3);
  return (((x / factor) << 12) & ((mask3 / factor) << 12))
    == ((val3 / factor) << 12);
}

bool pred3c(uint32_t x) {
  uint32_t factor = POWER_OF_TWO_FACTOR(mask3);
  register uint32_t temp = x << 12;
  __asm__ volatile ("" : "+r" (temp));
  return (temp & ((mask3 / factor) << 12))
    == ((val3 / factor) << 12);
}

I tested the code in the Compiler Explorer (godbolt.org).

Note: I also reported the issue in GCC bug tracker, the link is here in case you are interested: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114087

llvm / llvm-project

RISCV optimization on checking certain bits set ((x & mask) == val) #82875

Test code