gfx-rs / wgpu-native

Native WebGPU implementation based on wgpu-core
Apache License 2.0
885 stars 105 forks source link

Complete Timestamp Query feature implementation #439

Closed zackgomez closed 1 month ago

zackgomez commented 1 month ago

Expose the two subfeatures as native features. If these feature definitions move to the upstream webgpu.h in the future, switching them from native to standard features should be a trivial change.

Test Plan

Verify that timestamp writes work through each of the following APIs:

- ✅ Compute pass descriptor timestampWrites (WGPUFeature_TimestampQuery)
- ✅ wgpuCommandEncoderWriteTimestamp (WGPUNativeFeature_TimestampQueryInsideEncoders)
- ✅ wgpuComputePassEncoderWriteTimestamp (WGPUNativeFeature_TimestampQueryInsidePasses)

output

query time: 7872 ns
copy time: 7168 ns
inter pass time: 2080 ns

modified examples/compute/main.c

// [...]

/* Slots in the timestamp query set, grouped by the code path that writes them. */

/* Written with wgpuCommandEncoderWriteTimestamp around the buffer copy. */
static const uint32_t ENCODER_QUERY_START = 0u;
static const uint32_t ENCODER_QUERY_END = 1u;

/* Written through the compute pass descriptor's timestampWrites. */
static const uint32_t COMPUTE_PASS_QUERY_START = 2u;
static const uint32_t COMPUTE_PASS_QUERY_END = 3u;

/* Written with wgpuComputePassEncoderWriteTimestamp inside the pass. */
static const uint32_t INTER_PASS_QUERY_START = 4u;
static const uint32_t INTER_PASS_QUERY_END = 5u;

/* Total number of slots; sizes the query set and the resolve buffer. */
static const uint32_t QUERY_COUNT = 6u;

/*
 * Entry point of the compute example (excerpt; elided parts are marked
 * with "// [...]").
 *
 * The visible portion creates a timestamp query set with QUERY_COUNT slots,
 * records timestamps via the compute pass descriptor, inside the compute
 * pass, and on the command encoder around a buffer copy, then resolves the
 * query set and copies the results into the staging buffer directly after
 * the numbers so that both can be read back and printed.
 */
int main(int argc, char *argv[]) {

// [...]

  WGPUQuerySet query_set = wgpuDeviceCreateQuerySet(device, &(const WGPUQuerySetDescriptor){
    .label = "query_set",
    .type = WGPUQueryType_Timestamp,
    .count = QUERY_COUNT,
  });
  assert(query_set);

  /* One 64-bit value per query slot; target of the query-set resolve. */
  uint64_t query_buffer_size = QUERY_COUNT * sizeof(uint64_t);
  WGPUBuffer query_buffer = wgpuDeviceCreateBuffer(
      device, &(const WGPUBufferDescriptor){
                  .label = "query_buffer",
                  .usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc,
                  .size = query_buffer_size,
                  .mappedAtCreation = false,
              });
  assert(query_buffer);

  uint64_t storage_buffer_size = numbers_size;
  WGPUBuffer storage_buffer = wgpuDeviceCreateBuffer(
      device, &(const WGPUBufferDescriptor){
                  .label = "storage_buffer",
                  .usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
                           WGPUBufferUsage_CopySrc,
                  .size = storage_buffer_size,
                  .mappedAtCreation = false,
              });
  assert(storage_buffer);

  /* Staging layout: [0, numbers_size) numbers, then the resolved queries. */
  uint64_t staging_buffer_size = storage_buffer_size + query_buffer_size;
  WGPUBuffer staging_buffer = wgpuDeviceCreateBuffer(
      device, &(const WGPUBufferDescriptor){
                  .label = "staging_buffer",
                  .usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst,
                  .size = staging_buffer_size,
                  .mappedAtCreation = false,
              });
  assert(staging_buffer);

// [...]

  WGPUCommandEncoder command_encoder = wgpuDeviceCreateCommandEncoder(
      device, &(const WGPUCommandEncoderDescriptor){
                  .label = "command_encoder",
              });
  assert(command_encoder);

  /* Timestamps taken at the beginning and end of the compute pass. */
  WGPUComputePassTimestampWrites timestamp_writes = {
    .querySet = query_set,
    .beginningOfPassWriteIndex = COMPUTE_PASS_QUERY_START,
    .endOfPassWriteIndex = COMPUTE_PASS_QUERY_END,
  };
  WGPUComputePassEncoder compute_pass_encoder =
      wgpuCommandEncoderBeginComputePass(command_encoder,
                                         &(const WGPUComputePassDescriptor){
                                             .label = "compute_pass",
                                             .timestampWrites = &timestamp_writes,
                                         });
  assert(compute_pass_encoder);

  /* Timestamps written from inside the pass, around the state setup. */
  wgpuComputePassEncoderWriteTimestamp(compute_pass_encoder, query_set, INTER_PASS_QUERY_START);

  wgpuComputePassEncoderSetPipeline(compute_pass_encoder, compute_pipeline);
  wgpuComputePassEncoderSetBindGroup(compute_pass_encoder, 0, bind_group, 0,
                                     NULL);

  wgpuComputePassEncoderWriteTimestamp(compute_pass_encoder, query_set, INTER_PASS_QUERY_END);

  wgpuComputePassEncoderDispatchWorkgroups(compute_pass_encoder, numbers_length, 1, 1);

  wgpuComputePassEncoderEnd(compute_pass_encoder);
  wgpuComputePassEncoderRelease(compute_pass_encoder);

  /* Timestamps written on the command encoder, around the result copy. */
  wgpuCommandEncoderWriteTimestamp(command_encoder, query_set, ENCODER_QUERY_START);

  wgpuCommandEncoderCopyBufferToBuffer(command_encoder, storage_buffer, 0,
                                       staging_buffer, 0, numbers_size);

  wgpuCommandEncoderWriteTimestamp(command_encoder, query_set, ENCODER_QUERY_END);

  /* Resolve every slot, then append the results after the numbers. */
  wgpuCommandEncoderResolveQuerySet(command_encoder, query_set, 0, QUERY_COUNT, query_buffer, 0);
  wgpuCommandEncoderCopyBufferToBuffer(command_encoder, query_buffer, 0,
      staging_buffer, numbers_size, query_buffer_size);

// [...]

  uint32_t *buf =
      (uint32_t *)wgpuBufferGetMappedRange(staging_buffer, 0, numbers_size);
  assert(buf);
  uint64_t *query_buf = (uint64_t *)wgpuBufferGetMappedRange(staging_buffer, numbers_size, query_buffer_size);
  assert(query_buf);

  /*
   * The arguments are fixed-width unsigned integers, so they are cast to the
   * exact types the conversion specifiers expect. Passing a uint64_t to
   * "%ld" (or a uint32_t to "%d") is a format/argument mismatch and prints
   * garbage on targets where long is 32 bits wide.
   */
  unsigned long long compute_pass_time =
      (unsigned long long)(query_buf[COMPUTE_PASS_QUERY_END] - query_buf[COMPUTE_PASS_QUERY_START]);
  unsigned long long inter_pass_time =
      (unsigned long long)(query_buf[INTER_PASS_QUERY_END] - query_buf[INTER_PASS_QUERY_START]);
  unsigned long long copy_time =
      (unsigned long long)(query_buf[ENCODER_QUERY_END] - query_buf[ENCODER_QUERY_START]);

  /* NOTE(review): values are labelled "ns"; confirm whether the raw
   * timestamps need scaling by the queue's timestamp period first. */
  printf("times: [%u, %u, %u, %u]\n", (unsigned)buf[0], (unsigned)buf[1],
         (unsigned)buf[2], (unsigned)buf[3]);
  printf("compute pass time: %llu ns\n", compute_pass_time);
  printf("inter pass time: %llu ns\n", inter_pass_time);
  printf("copy time: %llu ns\n", copy_time);

// [...]
}
zackgomez commented 1 month ago

rebased and fixed formatting issue