code for the optimization of svbrdf textures crashes with both llvm and cuda variants after hundreds of renders.

osylum commented 1 year ago

Summary

optimization code for svbrdf crashes with both llvm and cuda variants after hundreds of renders.

System configuration

OS: Windows-10
CPU: Intel64 Family 6 Model 165 Stepping 5, GenuineIntel
GPU: NVIDIA RTX A4000
Python: 3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)] NVidia driver: 517.40
CUDA: 10.0.130 LLVM: 15.-1.-1

Dr.Jit: 0.4.1 Mitsuba: 3.2.1 Is custom build? False Compiled with: MSVC 19.34.31942.0 Variants: scalar_rgb scalar_spectral cuda_ad_rgb llvm_ad_rgb

Description

I have an optimization loop that optimizes the BSDF textures of a custom BSDF. I have a dataset with 2414 images. I tried both the prb and the direct integrator. Rendering my dataset using the same scene works fine. The program crashes in the dr.backward_from call.

When using the cuda variant, I get the following errors (not always the same) after some hundreds of renders:


Critical Dr.Jit compiler failure: cuda_check(): API error 0700 (CUDA_ERROR_ILLEGAL_ADDRESS): "an illegal memory access was encountered" in D:\a\drjit\drjit\ext\drjit-core\src\util.cpp:203.


RuntimeError: jit_malloc(): out of memory! Could not allocate 8388608 bytes of device memory.

When using llvm, it also crashes, with a longer error message. The message is very long, so I extracted only part of it. Let me know if you need the full message and, in that case, how to attach a text file to my comments.


Critical Dr.Jit compiler failure: jit_llvm_compile(): parsing failed. Please see the LLVM IR and error message below:

define void @drjit_4239a778091d697a3bfdf1c536306938(i64 %start, i64 %end, ptr noalias %params) #0 {
entry:
    %callables = load ptr, ptr @callables, align 8
    %buffer = alloca i8, i32 696, align 32
    br label %body

body:
    %index = phi i64 [ %index_next, %suffix ], [ %start, %entry ]
    %f1_1 = insertelement <8 x float> undef, float 0xc012bffd00000000, i32 0
    %f1 = shufflevector <8 x float> %f1_1, <8 x float> undef, <8 x i32> zeroinitializer
...
    br label %l278_start

l278_start:
    ; VCall: mitsuba::Shape::compute_surface_interaction()
    %u278_self_ptr = getelementptr i64, ptr %rd276, <8 x i32> %r265
    %u278_self_combined = call <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr> %u278_self_ptr, i32 8, <8 x i1> %p275, <8 x i64> zeroinitializer)
    %u278_self_initial = trunc <8 x i64> %u278_self_combined to <8 x i32>
    %u278_offset_1 = lshr <8 x i64> %u278_self_combined, 
...
l278_check:
    %u278_self = phi <8 x i32> [ %u278_self_initial, %l278_start ], [ %u278_self_next, %l278_call ]
    %u278_next = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %u278_self)
    %u278_valid = icmp ne i32 %u278_next, 0
    br i1 %u278_valid, label %l278_call, label %l278_end

l278_call:
    %u278_bcast_0 = insertelement <8 x i32> undef, i32 %u278_next, i32 0
    %u278_bcast = shufflevector <8 x i32> %u278_bcast_0, <8 x i32> undef, <8 x i32> zeroinitializer
    %u278_active = icmp eq <8 x i32> %u278_self, %u278_bcast
...
l278_end:
    %u278_out_0_1 = getelementptr inbounds i8, ptr %u278_out, i64 0
    %f279 = load <8 x float>, ptr %u278_out_0_1, align 32
    %u278_out_1_1 = getelementptr inbounds i8, ptr %u278_out, i64 32
    %f289 = load <8 x float>, ptr %u278_out_1_1, align 32
...
l278_done:
    %rd295_p1 = getelementptr inbounds ptr, ptr %params, i32 40
    %rd295 = load ptr, ptr %rd295_p1, align 8, !alias.scope !2
    %p296 = xor <8 x i1> %p251, 
    %f297_1 = insertelement <8 x float> undef, float 0x7ff0000000000000, i32 0
    %f297 = shufflevector <8 x float> %f297_1, <8 x float> undef, <8 x i32> zeroinitializer
...
l401_check:
    %u401_self = phi <8 x i32> [ %u401_self_initial, %l401_start ], [ %u401_self_next, %l401_call ]
    %u401_next = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %u401_self)
    %u401_valid = icmp ne i32 %u401_next, 0
    br i1 %u401_valid, label %l401_call, label %l401_end

l401_call:
    %u401_bcast_0 = insertelement <8 x i32> undef, i32 %u401_next, i32 0
    %u401_bcast = shufflevector <8 x i32> %u401_bcast_0, <8 x i32> undef, <8 x i32> zeroinitializer
    %u401_active = icmp eq <8 x i32> %u401_self, %u401_bcast
...
l401_done:
    %f411_1 = insertelement <8 x float> undef, float 0x0, i32 0
    %f411 = shufflevector <8 x float> %f411_1, <8 x float> undef, <8 x i32> zeroinitializer
    %p412 = fcmp one <8 x float> %f398, %f411
    %p413 = and <8 x i1> %p316, %p412
...
...
define void @func_1e8a2a49e342d8830f8416dd98877800(<8 x i1> %mask, <8 x i32> %self, ptr noalias %params, ptr noalias %data, <8 x i32> %offsets) #0 {
entry:
    ; VCall: mitsuba::Shape::compute_surface_interaction()
    %f1_i1 = getelementptr inbounds i8, ptr %params, i64 0
    %f1 = load <8 x float>, ptr %f1_i1, align 32
    %r2 = bitcast <8 x i32> %self to <8 x i32>
...
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)

declare <8 x float> @llvm.masked.gather.v8f32(<8 x ptr>, i32, <8 x i1>, <8 x float>)

declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)

@callables = dso_local local_unnamed_addr global ptr null, align 8

declare <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr>, i32, <8 x i1>, <8 x i32>)

declare <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr>, i32, <8 x i1>, <8 x i64>)

define void @set_callables(ptr %ptr) local_unnamed_addr #0 {
    store ptr %ptr, ptr @callables
    ret void
}

declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)

declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>)

declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>)

define internal void @reduce_fadd_f32(<8 x ptr> %ptr, <8 x float> %value, <8 x i1> %active_in) #0 {
L0:
   br label %L1

L1:
   %index = phi i32 [ 0, %L0 ], [ %index_next, %L3 ]
   %active = phi <8 x i1> [ %active_in, %L0 ], [ %active_next_2, %L3 ]
   %active_i = extractelement <8 x i1> %active, i32 %index
   br i1 %active_i, label %L2, label %L3

L2:
   %ptr_0 = extractelement <8 x ptr> %ptr, i32 %index
   %ptr_1 = insertelement <8 x ptr> undef, ptr %ptr_0, i32 0
   %ptr_2 = shufflevector <8 x ptr> %ptr_1, <8 x ptr> undef, <8 x i32> zeroinitializer
   %ptr_eq = icmp eq <8 x ptr> %ptr, %ptr_2
   %active_cur = and <8 x i1> %ptr_eq, %active
   %value_cur = select <8 x i1> %active_cur, <8 x float> %value, <8 x float> zeroinitializer
   %sum = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float -0.0, <8 x float> %value_cur)
   atomicrmw fadd ptr %ptr_0, float %sum monotonic
   %active_next = xor <8 x i1> %active, %active_cur
   %active_red = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %active_next)
   br i1 %active_red, label %L3, label %L4

L3:
   %active_next_2 = phi <8 x i1> [ %active, %L1 ], [ %active_next, %L2 ]
   %index_next = add nuw nsw i32 %index, 1
   %cond_2 = icmp eq i32 %index_next, 8
   br i1 %cond_2, label %L4, label %L1

L4:
   ret void
}

declare <8 x float> @llvm.floor.v8f32(<8 x float>)

declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)

declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)

!0 = !{!0}
!1 = !{!1, !0}
!2 = !{!1}
!3 = !{i32 1}
!4 = !{!"llvm.loop.unroll.disable", !"llvm.loop.vectorize.enable", i1 0}

attributes #0 = { norecurse nounwind "frame-pointer"="none" "no-builtins" "no-stack-arg-probe" "target-cpu"="skylake" "target-features"="-vzeroupper,-avx512pf,-tsxldtrk,+cx16,+sahf,-tbm,-avx512ifma,-sha,+crc32,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-avx512bf16,-amx-tile,-uintr,-gfni,+popcnt,-ptwrite,+aes,-avx512bitalg,-movdiri,-widekl,+xsaves,-avx512er,-avxvnni,-avx512fp16,-avx512vnni,-amx-bf16,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,+xsavec,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-kl,-movdir64b,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,-serialize,-hreset,+invpcid,-avx512cd,+avx,-vaes,-amx-int8,+cx8,+fma,-rtm,+bmi,-enqcmd,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,+fxsr,-wbnoinvd,+sse,+lzcnt,+pclmul,-rdpru,+f16c,+ssse3,-sgx,-prefetchwt1,+cmov,-avx512vbmi,-shstk,+movbe,-avx512vp2intersect,+xsaveopt,-avx512dq,+sse2,+adx,+sse3" }

drjit_4239a778091d697a3bfdf1c536306938:1890:23: error: use of undefined value '%f0'
    store <8 x float> %f0, ptr %f0_p5, align 32, !noalias !2, !nontemporal !3
                      ^

Process finished with exit code -1073740791 (0xC0000409)

Steps to reproduce

I currently don't have a minimal example that can be reproduced without the dataset and helper tools that I am using, but here are selected parts of the code.

setting optimizer:


    # Learning rate passed to the Adam optimizer below.
    lr_rate = 0.01
    optimizer = mi.ad.Adam(lr=lr_rate)
    # Register every scene parameter listed in `keys` with the optimizer
    # (`keys` and `scene_params` are defined outside this excerpt).
    for key in keys:
        optimizer[key] = scene_params[key]  # needs to be differentiable
    # Write the optimizer-held values back into the scene parameters.
    scene_params.update(optimizer)

loop optimization:


    # Optimization driver: for every stage and iteration, render each dataset
    # sample, back-propagate its L1 image loss, then take a single optimizer
    # step once the whole dataset has been visited.
    # NOTE(review): `scene`, `dataset_infos`, `images_true`, `optimizer`,
    # `output_path`, `compare_images`, `num_stages` and
    # `num_iterations_per_stage` come from code outside this excerpt.
    losses = []
    intermediate_images = []
    start_time = timer()
    for stage in range(num_stages):
        print(f"Stage {stage + 1:02d}")

        start_time_stage = timer()
        for it in range(num_iterations_per_stage):
            print(f"Iter {it}")
            total_loss = 0.0
            images = []

            start_local = timer()
            for i, sample_infos in enumerate(dataset_infos):

                start_sample = timer()

                # Re-traverse the scene and point the sensor/emitter at this
                # sample's configuration before rendering.
                scene_params = mi.traverse(scene)
                scene_params['sensor.x_fov'] = sample_infos['sensor_fov']
                scene_params['sensor.to_world'] = sample_infos['sensor_to_world']
                scene_params['emitter.position'] = sample_infos['emitter_position']
                scene_params.update()

                image = mi.render(scene, scene_params)  # passing params is important here

                image_true = images_true[i]

                # Save a side-by-side comparison for the first sample only.
                if i == 0:
                    filepath = os.path.join(output_path, f"comparison_stage{stage:02d}_iteration{it:02d}.jpg")
                    compare_images(image, f"stage{stage:02d}_iteration{it:02d}", images_true[0], 'Reference', filepath)

                # compute loss
                # Mean absolute difference over the first three channels of the
                # render (assumes `image_true` has matching shape — TODO confirm).
                loss = dr.mean(dr.abs(image[:,:,0:3] - image_true)) 
                # This is the call reported as crashing in the issue.
                dr.backward_from(loss) 
                total_loss += loss[0]

                # Store images at the end of every stage
                if it == num_iterations_per_stage - 1:
                    dr.eval(image)
                    images.append(image)

                # Progress/timing output every 40 samples.
                if i%40 == 0:
                    print(f"elapsed time sample {i}: {timer() - start_sample}")

            # One optimizer step per full pass over the dataset; presumably the
            # per-sample gradients have accumulated by now — verify.
            losses.append(total_loss)
            optimizer.step()
            # NOTE(review): `scene_params` here is the object created for the
            # last sample of the inner loop.
            scene_params.update(optimizer)
            print(f"total loss: {total_loss}")

            print(f"elapsed time to render dataset: {timer() - start_local}")

I set up the cuda variant in every file that I include in my main application, but I am not sure this is the correct way. I had tried doing so in the main application only, but that created more problems, in particular because the variant was then not set in my custom BSDF file.


import mitsuba as mi
#mi.set_variant('cuda_ad_rgb')
mi.set_variant('llvm_ad_rgb')

I register my custom BSDF when creating the bsdf dict for my scene, not in my bsdf python file. I am not sure this is correct, and it may cause problems. I remember that registering all my bsdfs in my bsdf python file (after the class definition) created problems.

njroussel commented 1 year ago

Hi @osylum

This sounds exactly like this issue: https://github.com/mitsuba-renderer/drjit-core/pull/58 The fix has made its way upstream to mitsuba but is not yet available in the pip wheels. You will need to build the project yourself.

osylum commented 1 year ago

I see, building the project is on my list regarding the cuda crash anyway from a previous thread, so I will tackle that. Thanks

njroussel commented 1 year ago

I'll close this issue for now, I'm convinced it's a duplicate.

Please keep this thread updated if it did not solve your issue -- I'll re-open the issue then.

osylum commented 1 year ago

I checked out and compiled mitsuba3 sources as explained in: https://mitsuba.readthedocs.io/en/stable/src/developer_guide/compiling.html

I used VS2022 and cmake 3.22.2

Dr.Jit: 0.4.1 Mitsuba: 3.2.1 Is custom build? False Compiled with: MSVC 19.32.31329.0 Variants: scalar_rgb scalar_spectral cuda_ad_rgb llvm_ad_rgb

Then I also simplified my code to the point of creating an example that you should be able to run (see below).

Unfortunately even the simplified example below creates the same type of crash.

Here are also some details about the python env I used:

Adding the mitsuba and drjit folders into a python env did not work. I created a wheel as explained in: https://github.com/mitsuba-renderer/mitsuba3/discussions/407


"C:\Python39\python.exe" -m virtualenv test_optimize_svbrdf
pip install "mitsuba-3.2.1-cp39-cp39-win_amd64.whl"
pip install imageio
pip install matplotlib==3.5.2
pip install pipwin
pip install "OpenEXR-1.3.8-cp39-cp39-win_amd64.whl"
pip install pyexr
pip install opencv-python
pip install scipy
pip install scikit-image
pip install pandas


import os, sys
import importlib
import gc
import shutil

from timeit import default_timer as timer
import datetime

import drjit as dr
import mitsuba as mi

print('available mitsuba variants:', mi.variants())
# mi.set_variant("scalar_rgb")
mi.set_variant('llvm_ad_rgb')
# mi.set_variant('cuda_ad_rgb')

import pyexr  # note: first install openexr with wheel file

os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "True"
import cv2 as cv

import numpy as np
from scipy import stats

import imageio
import matplotlib.pyplot as plt

from scipy.spatial.transform import Rotation
import skimage

import pandas as pd

def dot(x, y):
    """Dot product of `x` and `y` along the last axis; that axis is kept with size 1."""
    elementwise = x * y
    return np.sum(elementwise, axis=-1, keepdims=True)

def length_epsilon(x, eps=1e-20):
    """
    Euclidean length of `x` along its last axis, never smaller than sqrt(eps).

    Args:
        x   : array of vectors; the last axis holds the vector components
        eps : lower bound applied to the squared length before the square root

    Returns:
        Array of lengths with the last axis kept (size 1).
    """
    # Same reduction as dot(x, x): squared length over the last axis.
    squared = np.sum(x * x, -1, keepdims=True)
    # Floor the squared length at eps. The previous `clip(eps, xdx)` used the
    # array itself as the upper bound, which undid the lower bound and left
    # the values unchanged.
    squared = np.maximum(squared, eps)
    return np.sqrt(squared)

def normalize_epsilon(x, eps=1e-20):
    """Divide `x` by its `length_epsilon` length plus `eps`, so the denominator includes an extra `eps` term."""
    denominator = length_epsilon(x, eps) + eps
    return x / denominator

def create_bsdf(bsdf_params):
    """
    Build the BSDF for the scene from `bsdf_params`.

    `bsdf_params['type']` selects the material: 'disney' (textured principled
    BSDF), 'disney_aov' (constant principled BSDF), 'plastic' or 'diffuse'.
    If `bsdf_params` has a 'normal_texture_path' that exists on disk, the
    result is wrapped in a normalmap BSDF.

    Returns a dict description, except for 'plastic' which is loaded through
    mi.load_dict right away. Raises Exception for an unknown type.
    """
    kind = bsdf_params['type']

    if kind == 'disney':
        def raw_bitmap(path_key):
            # Texture description reading the file named by bsdf_params[path_key].
            return {"type": "bitmap", "filename": bsdf_params[path_key], "raw": True}

        bsdf = {
            "type": "principled",
            "base_color": raw_bitmap('base_color_texture_path'),
            "roughness": raw_bitmap('roughness_texture_path'),
            "metallic": raw_bitmap('metallic_texture_path'),
            "specular": 0.5,
        }
    elif kind == 'disney_aov':
        bsdf = {
            "type": "principled",
            "base_color": np.pi,
            "roughness": 1.0,
            "metallic": 0.0,
            "specular": 0.0,
        }
    elif kind == 'plastic':  # test only
        bsdf = mi.load_dict({'type': 'plastic'})
    elif kind == 'diffuse':  # test only
        bsdf = {'type': 'diffuse', 'reflectance': 0.5}
    else:
        raise Exception(f"unknown bsdf type: {bsdf_params['type']}")

    # Without a normal map file on disk, the plain BSDF is the result.
    has_normal_map = ('normal_texture_path' in bsdf_params
                      and os.path.exists(bsdf_params['normal_texture_path']))
    if not has_normal_map:
        return bsdf

    return {
        "type": "normalmap",
        "normalmap": {
            "type": "bitmap",
            "filename": bsdf_params['normal_texture_path'],
            "raw": True,
        },
        "bsdf": bsdf,
    }

def create_object(object_params):
    """
    Build the shape description for the scene.

    Only `object_params['type'] == 'sphere'` is supported; the sphere takes its
    'center', 'radius' and 'bsdf' from `object_params`. Any other type raises
    Exception.
    """
    if object_params['type'] != 'sphere':
        raise Exception(f"unknown object type: {object_params['type']}")

    return {
        'type': 'sphere',
        'center': object_params['center'],
        'radius': object_params['radius'],
        'bsdf': object_params['bsdf'],
    }

def create_emitter(emitter_params):
    """
    Load a point emitter at the origin with intensity 10000.

    `emitter_params` is accepted for symmetry with the other factories but is
    not read.
    """
    point_light = {
        'type': 'point',
        "position": [0, 0, 0],
        "intensity": 10000.0,
    }
    return mi.load_dict(point_light)

def create_sensor(sensor_params):
    """
    Load a perspective sensor with an RGBA HDR film.

    Reads 'fov', 'to_world', 'film_width', 'film_height' and 'film_rfilter'
    from `sensor_params`. A 'film_rfilter' of 'box' selects a box
    reconstruction filter (used for AOVs); every other value selects a
    gaussian filter with stddev 0.25.
    """
    fov = sensor_params['fov']
    to_world = sensor_params['to_world']
    film_size = (sensor_params['film_width'], sensor_params['film_height'])

    # Alternative tried by the author for the non-box case: {"type": "lanczos"}
    use_box = sensor_params['film_rfilter'] == 'box'  # for aov
    rfilter = {'type': 'box'} if use_box else {"type": "gaussian", "stddev": 0.25}

    film = {
        "type": "hdrfilm",
        "width": film_size[0],
        "height": film_size[1],
        "pixel_format": "rgba",
        "rfilter": rfilter,
    }
    description = {
        "type": "perspective",
        "fov_axis": "x",
        "fov": fov,
        "to_world": to_world,
        "film": film,
    }
    return mi.load_dict(description)

def create_integrator(integrator_params):
    """
    Load the integrator selected by `integrator_params['type']`.

    'beauty' -> direct integrator, 'aov' -> aov integrator (position, shading
    normal, uv) wrapping a direct integrator, 'prb' -> prb integrator. All of
    them hide emitters. Any other type raises Exception.
    """
    kind = integrator_params['type']

    if kind == 'beauty':
        description = {
            "type": "direct",
            "hide_emitters": True,
        }
    elif kind == 'aov':
        description = {
            "type": "aov",
            "aovs": "pp:position,nn:sh_normal,uv:uv",
            "shadow_integrator": {
                "type": "direct",
                "hide_emitters": True,
            },
        }
    elif kind == 'prb':
        description = {
            "type": "prb",
            "hide_emitters": True,
        }
    else:
        raise Exception(f"unknown integrator type: {integrator_params['type']}")

    # Single load point for whichever description was selected above.
    return mi.load_dict(description)

def create_scene(bsdf_params, object_params, emitter_params, sensor_params, integrator_params):
    """
    Assemble and load the scene from its five parameter dicts.

    The BSDF is built first and stored into `object_params['bsdf']` (the
    caller's dict is modified), then the object, emitter, sensor and
    integrator are created in that order and combined into one scene.

    Returns:
        (scene, sensor): the loaded scene and the sensor placed in it.
    """
    # The object description picks its material up from object_params.
    object_params['bsdf'] = create_bsdf(bsdf_params)
    geo = create_object(object_params)

    emitter = create_emitter(emitter_params)
    sensor = create_sensor(sensor_params)
    integrator = create_integrator(integrator_params)

    scene_description = {
        "type": "scene",
        "emitter": emitter,
        "object": geo,
        "sensor": sensor,
        "integrator": integrator,
    }
    return mi.load_dict(scene_description), sensor

def create_disney_brdf_textures_uniform(base_color=np.array([0.5, 0.5, 0.5]),
                                          metallic=0.5,
                                          roughness=0.15,
                                          texture_res=256,
                                          textures_data_directory=''):
    """
    Helper to create spatially uniform textures for the principled ("disney")
    BSDF and, when the output directory exists, write them as EXR files.

    Args:
        base_color              : base color
        metallic                : metallicity
        roughness               : roughness
        texture_res             : texture resolution
        textures_data_directory : output path for saving textures; nothing is
                                  written if this path does not exist

    Returns:
        (texture_base_color, texture_metallic, texture_roughness): arrays of
        shape (texture_res, texture_res, 3), (texture_res, texture_res) and
        (texture_res, texture_res) respectively.
    """
    texture_ones = np.ones((texture_res, texture_res, 3))
    texture_base_color = texture_ones * base_color
    texture_metallic = texture_ones[:, :, 0] * metallic
    texture_roughness = texture_ones[:, :, 0] * roughness

    if os.path.exists(textures_data_directory):
        print(f"exporting textures to {textures_data_directory}")
        # The roughness texture is stored as "alpha.exr"; the rest of the
        # script loads it under that file name.
        # Textures are written through mi.Bitmap; the author noted that
        # writing them with pyexr produced wrong colors.
        exports = (
            ("base_color.exr", texture_base_color),
            ("metallic.exr", texture_metallic),
            ("alpha.exr", texture_roughness),
        )
        for filename, texture in exports:
            target_path = f"{textures_data_directory}/{filename}"
            mi.Bitmap(texture.astype(dtype=np.float32)).write(target_path)

    return texture_base_color, texture_metallic, texture_roughness

# convenience function to plot images
def plot_list(images, title=None, savepath=None):
    """
    Plot `images` on a grid with 5 columns, then save or show the figure.

    Args:
        images   : sequence of images accepted by mi.util.convert_to_bitmap
        title    : optional figure title
        savepath : if given, the figure is saved there; otherwise it is shown

    The figure is closed before returning so that repeated calls do not
    accumulate open matplotlib figures. An empty `images` is a no-op.
    """
    numimages = len(images)
    if numimages == 0:
        # plt.subplots() rejects a grid with zero rows.
        return

    numcols = 5
    numrows = (numimages + numcols - 1) // numcols  # ceiling division

    # squeeze=False keeps `axs` two-dimensional for every grid size.
    fig, axs = plt.subplots(numrows, numcols,
                            figsize=(numcols * 4, numrows * 3),
                            squeeze=False)
    try:
        for i, ax in enumerate(axs.flatten()):
            if i < numimages:
                ax.imshow(mi.util.convert_to_bitmap(images[i]))
            # Unused trailing cells are blanked as well.
            ax.axis('off')
        if title is not None:
            fig.suptitle(title)
        fig.tight_layout()
        if savepath is not None:
            fig.savefig(savepath)
        else:
            plt.show()
    finally:
        # Release the figure; without this every call leaves one behind.
        plt.close(fig)

def spherical_to_cartesian(phi, theta):
    """Unit direction for azimuth `phi` and polar angle `theta` (theta = 0 gives +z)."""
    sin_theta = np.sin(theta)
    x = np.cos(phi) * sin_theta
    y = np.sin(phi) * sin_theta
    z = np.cos(theta)
    return np.array([x, y, z])

def get_cameras_positions(center = np.zeros(3), dist = 40.5,
                          phi_numsteps = 4, theta_numsteps = 2):
    """
    Camera positions on a sphere of radius `dist` around `center`.

    Directions are sampled at `phi_numsteps` azimuths and the interior polar
    angles of a `theta_numsteps`-step subdivision of [0, pi]; the two poles are
    appended last. Prints the shapes of the direction and position arrays.
    """
    # Azimuths cover [0, 2*pi) without repeating the endpoint.
    phis = np.linspace(0.,np.pi*2., phi_numsteps+1)[:-1]
    # Both poles are dropped here and added once at the end.
    inner_thetas = np.linspace(0.,np.pi,theta_numsteps+1)[1:-1]

    directions = []
    for phi in phis:
        for theta in inner_thetas:
            directions.append(spherical_to_cartesian(phi, theta))
    directions += [spherical_to_cartesian(0.,0.), spherical_to_cartesian(0.,np.pi)]

    directions = np.array(directions)
    print(f"{np.shape(directions)=}")
    positions = center[np.newaxis,:] + directions*dist
    print(f"{np.shape(positions)=}")
    return positions

def get_lights_positions(center = np.zeros(3), dist = 49.,
                          phi_numsteps = 4, theta_numsteps = 2):
    """
    Light positions on a sphere of radius `dist` around `center`.

    Uses `phi_numsteps` azimuths combined with the interior polar angles of a
    `theta_numsteps`-step subdivision of [0, pi], followed by the two poles.
    """
    azimuths = np.linspace(0.,np.pi*2., phi_numsteps+1)[:-1]
    polar_angles = np.linspace(0.,np.pi,theta_numsteps+1)

    unit_vectors = []
    for azimuth in azimuths:
        # Interior polar angles only; the poles are appended once below.
        unit_vectors.extend(
            spherical_to_cartesian(azimuth, polar) for polar in polar_angles[1:-1]
        )
    unit_vectors.append(spherical_to_cartesian(0.,0.))
    unit_vectors.append(spherical_to_cartesian(0.,np.pi))

    offsets = np.array(unit_vectors) * dist
    return center[np.newaxis,:] + offsets

def get_dataset_infos(render_resolution,
                      bsdf_type = 'disney',
                      scene_center = np.ones(3),
                      camera_dist = 40.5,
                      light_dist = 49.,
                      phi_numsteps = 3,
                      theta_numsteps = 2):
    """
    Build one sample description per (light, camera) pair.

    Cameras and lights are placed around `scene_center` with
    get_cameras_positions / get_lights_positions. Each entry records the bsdf
    type, render resolution, light index ('olat_frame'), camera index
    ('camera_name'), the fixed sensor fov, the camera look-at matrix, and the
    normalized sensor/emitter directions relative to the scene center.
    Lights form the outer loop, cameras the inner one.
    """
    cameras_positions = get_cameras_positions(scene_center, camera_dist, phi_numsteps, theta_numsteps)
    lights_positions = get_lights_positions(scene_center, light_dist, phi_numsteps, theta_numsteps)

    camera_fov = 38.8839
    dataset_infos = []
    for olat_frame, emitter_position in enumerate(lights_positions):
        for camera_name, camera_position in enumerate(cameras_positions):
            # Camera looks from its position towards the scene center, z up.
            camera_to_world = mi.ScalarTransform4f.look_at(
                origin=mi.ScalarPoint3f(camera_position[0], camera_position[1], camera_position[2]),
                target=mi.ScalarPoint3f(scene_center[0], scene_center[1], scene_center[2]),
                up=mi.ScalarPoint3f(0.,0.,1.)
            )
            sample = {
                'bsdf_type': bsdf_type,
                'render_resolution': render_resolution,
                'olat_frame':olat_frame,
                'camera_name': camera_name,
                'sensor_fov': camera_fov,
                'sensor_to_world': camera_to_world.matrix.numpy(),
                'sensor_direction': normalize_epsilon(camera_position - scene_center),
                'emitter_position': emitter_position,
                'emitter_direction': normalize_epsilon(emitter_position - scene_center),
            }
            dataset_infos.append(sample)

    print(f'number of samples in dataset: {len(dataset_infos)}')
    return dataset_infos

def optimize_svbrdf(output_root_dir):
    """
    Test for the optimization of svbrdf parameters
    """

    bsdf_type = 'disney'
    num_stages = 5
    num_iterations_per_stage = 1
    render_resolution = [256, 256] #[864, 486]
    image_res_max = np.array(render_resolution)
    scene_center = np.array([0.,0.,0.])
    camera_dist = 40.5
    light_dist = 49.
    phi_numsteps = 4
    theta_numsteps = 3

    # create dataset infos
    dataset_infos = get_dataset_infos(render_resolution, bsdf_type,
                                      scene_center, camera_dist, light_dist,
                                      phi_numsteps, theta_numsteps)
    df = pd.DataFrame(dataset_infos)
    print(df.head())

    image_res_min = image_res_max

    texture_res_max = 256
    texture_res_min = texture_res_max
    texture_res = texture_res_max

    # load material parameters from merl
    if bsdf_type == 'disney':
        base_color = np.ones(3) * 0.5
        metallic = 0.5
        roughness = 0.15
        specular = 0.5

    # set output path
    # -------------------------
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    output_path = os.path.join(output_root_dir, now) if output_root_dir is not None else None
    if output_path is not None:
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        print(f'output path: {output_path}')

    # create default textures
    # ------------------------------
    if bsdf_type == 'disney':
        textures_data_directory = os.path.join('textures')
        if not os.path.exists(textures_data_directory):
            os.makedirs(textures_data_directory)
        create_disney_brdf_textures_uniform(base_color=np.array([0.8, 0.2, 0.1]),
                                          metallic=0.25,
                                          roughness=np.sqrt(0.01),
                                            textures_data_directory=textures_data_directory)
        base_color_texture_path = f"{textures_data_directory}/base_color.exr"
        roughness_texture_path = f"{textures_data_directory}/alpha.exr"
        metallic_texture_path = f"{textures_data_directory}/metallic.exr"
        #normal_texture_path = f"{}textures_data_directory/normal.exr"

    # create scene
    # -------------------------
    object_params = {'type': 'sphere', 'center': scene_center, 'radius': 5.}
    if bsdf_type == 'disney':
        bsdf_params = {
            'type': bsdf_type,
            'base_color_texture_path': base_color_texture_path,
            'roughness_texture_path': roughness_texture_path,
            'metallic_texture_path': metallic_texture_path,
            #'normal_texture_path': normal_texture_path,
        }
    scene, sensor = create_scene(bsdf_params=bsdf_params,
                                object_params=object_params,
                                emitter_params={},
                                sensor_params={
                                    'fov': 38.49,
                                    'to_world': mi.ScalarTransform4f(),
                                    'film_width': render_resolution[0],
                                    'film_height': render_resolution[1],
                                    'film_rfilter': 'guaussian'
                                },
                                integrator_params={'type': 'beauty'})  # prb, beauty
    scene_params = mi.traverse(scene)
    print(f"scene params: {scene_params}")

    # access param and define keys to change params values
    scene_params = mi.traverse(scene)
    print(scene_params)
    if bsdf_type == 'disney':
        key_base_color = 'object.bsdf.base_color.data'
        key_roughness = 'object.bsdf.roughness.data'
        key_metallic = 'object.bsdf.metallic.data'
        #key_base_color = 'object.bsdf.nested_bsdf.base_color.data'
        #key_roughness = 'object.bsdf.nested_bsdf.roughness.data'
        #key_metallic = 'object.bsdf.nested_bsdf.metallic.data'
        keys = [key_base_color, key_roughness, key_metallic]

    # load reference images
    print('loading reference images')
    images_true = []
    start = timer()
    print(f"number of images in dataset: {len(dataset_infos)}")
    for i, sample_infos in enumerate(dataset_infos):

        scene_params = mi.traverse(scene)
        scene_params['sensor.x_fov'] = sample_infos['sensor_fov']
        scene_params['sensor.to_world'] = sample_infos['sensor_to_world']
        scene_params['emitter.position'] = sample_infos['emitter_position']
        scene_params.update()

        image = mi.render(scene, scene_params)

        # image = mi.TensorXf(image)
        images_true.append(image)
        if False:
            plt.axis('off')
            plt.imshow(dr.clip(image, 0.0, 1.0))
            plt.title('reference')
            plt.show()
    print(f"elapsed time: {timer() - start}")
    print('done loading reference images')

    # plot some images of the dataset
    camera_index = 1
    lights_names = olat_frames = df['olat_frame'].unique()
    numlights = len(lights_names)
    filepath = os.path.join(output_path, 'reference')
    plot_list(images_true[camera_index * numlights:(camera_index + 1) * numlights], title="reference images", savepath=filepath)

    # set initial brdf parameters
    texture_ones_3 = np.ones((texture_res, texture_res, 3))
    texture_ones_1 = np.ones((texture_res, texture_res, 3))
    if bsdf_type == 'disney':
        base_color_init = np.ones(3) * 0.5  # np.array([1.,0.2,0.1])
        metallic_init = 0.5
        roughness_init = np.sqrt(0.15)
        texture_base_color_init = mi.TensorXf(texture_ones_3 * base_color_init)
        texture_metallic_init = mi.TensorXf(texture_ones_3 * metallic_init)
        texture_roughness_init = mi.TensorXf(texture_ones_1 * roughness_init)
        textures = [texture_base_color_init, texture_metallic_init, texture_roughness_init]

    if True:
        for key, texture in zip(keys, textures):
            print(f"{key=}")
            scene_params[key] = texture
        # scene_params.keep(keys) # this will delete in the list other parameters not in keys
        scene_params.update()
        # print(f"{scene_params=}")

    # render initial scene
    scene_params["sensor.x_fov"] = dataset_infos[0]['sensor_fov']
    scene_params["sensor.to_world"] = dataset_infos[0]['sensor_to_world']
    scene_params["emitter.position"] = dataset_infos[0]['emitter_position']
    image_init = mi.render(scene, scene_params)

    # conmpare initial with reference
    def compare_images(image_pred, title_pred, image_true, title_true, filepath=None):
        fig, axs = plt.subplots(1, 2, figsize=(10, 4))
        axs[0].imshow(mi.util.convert_to_bitmap(image_pred))
        axs[0].set_title(title_pred)
        axs[0].axis('off')
        axs[1].imshow(mi.util.convert_to_bitmap(image_true))
        axs[1].set_title(title_true)
        axs[1].axis('off')
        plt.tight_layout()
        if filepath is not None:
            plt.savefig(filepath)
        plt.show()

    filepath = os.path.join(output_path, "comparison_init.jpg") if output_path is not None else None
    compare_images(image_init, 'Initial', images_true[0], 'Reference', filepath)

    # create optimizer
    lr_rate = 0.01
    if True:
        optimizer = mi.ad.Adam(lr=lr_rate)
        for key in keys:
            optimizer[key] = scene_params[key]  # needs to be differentiable
        scene_params.update(optimizer)
    else:
        params = scene_params[key_rho_d]
        optimizer = mi.ad.Adam(lr=lr_rate, params=params)
        scene_params.update(optimizer)

    # set starting texture resolution
    if False:  # FIXME: cannot downsample
        for key in keys:
            print(f"{optimizer[key].shape=}")
            new_res = int(optimizer[key].shape[0] / texture_downsamplingFactor)
            print(f"{new_res=}")
            new_shape = [new_res, new_res, 3]
            optimizer[key] = dr.resize(optimizer[key], new_shape)
            scene_params.update(optimizer)
    # set starting image resolution
    render_resolution = image_res_min
    print(f"{render_resolution=}")

    follow_camera_index = 1
    follow_light_index = 0
    follow_sample_index = follow_camera_index * numlights + follow_light_index
    losses = []
    intermediate_images = []
    start_time = timer()
    for stage in range(num_stages):
        print(f"Stage {stage + 1:02d}")

        start_time_stage = timer()
        for it in range(num_iterations_per_stage):
            print(f"Iter {it}")
            total_loss = 0.0
            images = []

            start_local = timer()
            for i, sample_infos in enumerate(dataset_infos):

                start_sample = timer()

                scene_params = mi.traverse(scene)
                scene_params['sensor.x_fov'] = sample_infos['sensor_fov']
                scene_params['sensor.to_world'] = sample_infos['sensor_to_world']
                scene_params['emitter.position'] = sample_infos['emitter_position']
                scene_params.update()

                image = mi.render(scene, scene_params)  # passing params is important here

                image_true = images_true[i]

                if (i == follow_sample_index) and (output_path is not None):
                    title = f"comparison_stage{stage:02d}_iteration{it:02d}_camera{follow_camera_index:03d}_light{follow_light_index:03d}"
                    filepath = os.path.join(output_path, title + ".jpg")
                    compare_images(image, title, images_true[follow_sample_index], 'Reference', filepath)

                # compute loss
                loss = dr.mean(dr.abs(image[:, :, 0:3] - image_true[:,:,0:3]))  # TODO: add mask
                # print(f"loss: {loss[0]}")
                # dr.backward(loss, dr.ADFlag.ClearNone)
                # dr.backward_from(loss, dr.ADFlag.ClearEdges)
                # dr.backward_from(loss, dr.ADFlag.ClearNone)
                dr.backward(loss)
                total_loss += loss[0]

                # Store images at the end of every stage
                if it == num_iterations_per_stage - 1:
                    dr.eval(image)
                    images.append(image)

                if i % 40 == 0:
                    print(f"elapsed time sample {i}: {timer() - start_sample}")

            losses.append(total_loss)
            optimizer.step()
            scene_params.update(optimizer)
            print(f"total loss: {total_loss}")

            print(f"elapsed time to render dataset: {timer() - start_local}")

        brdf_params = [scene_params[key] for key in keys]

        # show images
        if True:
            sample_infos = dataset_infos[0]

            scene_params = mi.traverse(scene)
            scene_params['sensor.x_fov'] = sample_infos['sensor_fov']
            scene_params['sensor.to_world'] = sample_infos['sensor_to_world']
            scene_params['emitter.position'] = sample_infos['emitter_position']
            scene_params.update()

            image = mi.render(scene, scene_params)  # passing params is important here

            filepath = os.path.join(output_path, f"comparison_stage{stage:02d}.jpg")
            compare_images(image, f"stage{stage:02d}", images_true[0], 'Reference', filepath)
            # plot_list(images, title, filepath)

        # store history
        if output_path is not None:
            filepath = os.path.join(output_path, f"history_stage{stage:02d}.csv")
            np.savetxt(filepath, np.array(losses), delimiter=",")

        intermediate_images.append(images[0:7])

        # TODO: Upsample at every stage?

        elapsed_time_stage = timer() - start_time_stage
        print('elapsed time for this stage:', elapsed_time_stage)

    elapsed_time_total = timer() - start_time
    print('total elapsed time:', elapsed_time_total)

    # show final result
    plt.clf()
    plt.figure()  # resets the size
    plt.plot(np.arange(len(losses)), losses)
    plt.xlabel("step")
    plt.ylabel("loss")
    filepath = os.path.join(output_path, "history.jpg")
    plt.tight_layout()
    if output_path is not None:
        plt.savefig(filepath)
    plt.show()

    final_images = []
    for i, sample_infos in enumerate(dataset_infos):
        scene_params["sensor.x_fov"] = sample_infos['sensor_fov']
        scene_params["sensor.to_world"] = sample_infos['sensor_to_world']
        scene_params["emitter.position"] = sample_infos['emitter_position']
        image = mi.render(scene, scene_params)
        final_images.append(image)
    for stage, inter in enumerate(intermediate_images):
        title = f"stage: {stage}, "
        filepath = os.path.join(output_path, f"stage{stage:02d}.jpg") if output_path is not None else None
        plot_list(inter, title, filepath)

    title = f"Final: "
    filepath = os.path.join(output_path, "final.jpg") if output_path is not None else None
    plot_list(final_images, title, filepath)

    title = f"Reference: "
    filepath = os.path.join(output_path, "reference.jpg") if output_path is not None else None
    plot_list(images_true, title, filepath)

    fig, axs = plt.subplots(1, 2, figsize=(10, 4))
    axs[0].imshow(mi.util.convert_to_bitmap(final_images[0]))
    axs[0].set_title('Reconstructed')
    axs[0].axis('off')
    axs[1].imshow(mi.util.convert_to_bitmap(images_true[0]))
    axs[1].set_title('Reference')
    axs[1].axis('off')
    filepath = os.path.join(output_path, "comparison_final.jpg")
    plt.tight_layout()
    if output_path is not None:
        plt.savefig(filepath)
    plt.show()

def main():
    """Script entry point: run ``optimize_svbrdf`` with "outputs" as its argument."""
    optimize_svbrdf("outputs")

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

njroussel commented 1 year ago

I ran your script on both a Linux and Windows machine with the latest master commit without any issues (image_gradual_upsampling was not defined, so I set it to False).

Before anything else I would recommend double-checking your environment. Explicitly uninstall mitsuba and drjit before installing newer versions. As a sanity check, put a simple print somewhere in the source that you can trigger - just to confirm that you're indeed running a local build.

Adding the mitsuba and python folders into a python env did not work.

This is not a good sign. You should always be able to append the python build path to sys.path.

osylum commented 1 year ago

Thanks for testing.

Sorry, yes, I had removed the image_gradual_upsampling code part; I think the variable was still active in my pycharm project though. It was False, so it's fine. I corrected the code above in case people will try to run it.

I am quite puzzled. I created the python environment from scratch and the wheel was generated from the mitsuba build, so it can't be another (mitsuba,drjit) than the one from the build that is used in the python env.

For your last comment, I actually meant: Adding the mitsuba and drjit folders from the compiled build into the Lib/site-packages of the python environment did not work (when running the python file, it triggered an error with drjit init), as compared to using the wheel.

njroussel commented 1 year ago

I see.

I'm not ruling out the possibility that we have another bug that just happens to have the exact same failure behaviour as the one we recently fixed.

Basically this single line: https://github.com/mitsuba-renderer/drjit-core/pull/58/files should be in your source (the comment has slightly changed).

Could you share the entire LLVM IR ?

osylum commented 1 year ago

It seems I have this change in the code:

What is "entire LLVM IR"?

njroussel commented 1 year ago

Oh sorry, that wasn't clear. In your very first message, you posted a reduced version of the full error message which ended with:

drjit_4239a778091d697a3bfdf1c536306938:1890:23: error: use of undefined value '%f0'
    store <8 x float> %f0, ptr %f0_p5, align 32, !noalias !2, !nontemporal !3

I would need to see everything that is printed after the message: Critical Dr.Jit compiler failure: jit_llvm_compile(): parsing failed. Please see the LLVM IR and error message below:

osylum commented 1 year ago

I tried to add the output in the github comment but it is too long. I attached it here.

let me know how to put it on github.

From: Nicolas Roussel @.> Sent: April 20, 2023 5:36 PM To: mitsuba-renderer/mitsuba3 @.> Cc: Pascal Clausen @.>; Mention @.> Subject: Re: [mitsuba-renderer/mitsuba3] code for the optimization of svbrdf textures crashes with both llvm and cuda variants after hundreds of renders. (Issue #662)

Oh sorry, that wasn't clear. In your very fist message, you posted a reduced version of the full error message which ended with:

drjit_4239a778091d697a3bfdf1c536306938:1890:23: error: use of undefined value '%f0' store <8 x float> %f0, ptr %f0_p5, align 32, !noalias !2, !nontemporal !3

I would need to see everthing that is printed after the message: Critical Dr.Jit compiler failure: jit_llvm_compile(): parsing failed. Please see the LLVM IR and error message below:

— Reply to this email directly, view it on GitHubhttps://github.com/mitsuba-renderer/mitsuba3/issues/662#issuecomment-1516547805, or unsubscribehttps://github.com/notifications/unsubscribe-auth/A55IEYO2LV33DQABG2W4FULXCFJZNANCNFSM6AAAAAAW6H42IE. You are receiving this because you were mentioned.Message ID: @.***>

I see. Here is the full output:


C:\Users\pascal.clausen\venv\test_optimize_svbrdf\Scripts\python.exe C:\Users\pascal.clausen\wkspaces\DevPlastic9\RND_Relighting_Code\SceneIndependent\LearningBRDF\tests_mitsuba\che2020\learning\test_mitsuba_optimize_svbrdf.py 
available mitsuba variants: ['scalar_rgb', 'scalar_spectral', 'cuda_ad_rgb', 'llvm_ad_rgb']
np.shape(directions)=(10, 3)
np.shape(positions)=(10, 3)
number of samples in dataset: 100
  bsdf_type  ...                              emitter_direction
0    disney  ...  [0.8660254037844386, 0.0, 0.5000000000000001]
1    disney  ...  [0.8660254037844386, 0.0, 0.5000000000000001]
2    disney  ...  [0.8660254037844386, 0.0, 0.5000000000000001]
3    disney  ...  [0.8660254037844386, 0.0, 0.5000000000000001]
4    disney  ...  [0.8660254037844386, 0.0, 0.5000000000000001]

[5 rows x 9 columns]
output path: outputs\20230420-215722
exporting textures to textures
scene params: SceneParameters[
  ----------------------------------------------------------------------------------------------------
  Name                                             Flags    Type  Parent
  ----------------------------------------------------------------------------------------------------
  emitter.position                                          Point3f PointLight
  emitter.intensity.value                          ∂        Float UniformSpectrum
  object.bsdf.clearcoat.value                      ∂        Float UniformSpectrum
  object.bsdf.clearcoat_gloss.value                ∂        Float UniformSpectrum
  object.bsdf.metallic.data                        ∂        TensorXf BitmapTexture
  object.bsdf.metallic.to_uv                                ScalarTransform3f BitmapTexture
  object.bsdf.main_specular_sampling_rate                   float Principled
  object.bsdf.clearcoat_sampling_rate                       float Principled
  object.bsdf.diffuse_reflectance_sampling_rate             float Principled
  object.bsdf.specular                             ∂, D     Float Principled
  object.bsdf.roughness.data                       ∂, D     TensorXf BitmapTexture
  object.bsdf.roughness.to_uv                      , D      ScalarTransform3f BitmapTexture
  object.bsdf.base_color.data                      ∂        TensorXf BitmapTexture
  object.bsdf.base_color.to_uv                              ScalarTransform3f BitmapTexture
  object.bsdf.anisotropic.value                    ∂        Float UniformSpectrum
  object.bsdf.spec_tint.value                      ∂        Float UniformSpectrum
  object.bsdf.sheen.value                          ∂        Float UniformSpectrum
  object.bsdf.sheen_tint.value                     ∂        Float UniformSpectrum
  object.bsdf.spec_trans.value                     ∂        Float UniformSpectrum
  object.bsdf.flatness.value                       ∂        Float UniformSpectrum
  object.to_world                                  ∂, D     Transform4f Sphere
  sensor.near_clip                                          float PerspectiveCamera
  sensor.far_clip                                           float PerspectiveCamera
  sensor.shutter_open                                       float PerspectiveCamera
  sensor.shutter_open_time                                  float PerspectiveCamera
  sensor.film.size                                          ScalarVector2u HDRFilm
  sensor.film.crop_size                                     ScalarVector2u HDRFilm
  sensor.film.crop_offset                                   ScalarPoint2u HDRFilm
  sensor.x_fov                                     ∂, D     Float PerspectiveCamera
  sensor.to_world                                  ∂, D     Transform4f PerspectiveCamera
]
SceneParameters[
  ----------------------------------------------------------------------------------------------------
  Name                                             Flags    Type  Parent
  ----------------------------------------------------------------------------------------------------
  emitter.position                                          Point3f PointLight
  emitter.intensity.value                          ∂        Float UniformSpectrum
  object.bsdf.clearcoat.value                      ∂        Float UniformSpectrum
  object.bsdf.clearcoat_gloss.value                ∂        Float UniformSpectrum
  object.bsdf.metallic.data                        ∂        TensorXf BitmapTexture
  object.bsdf.metallic.to_uv                                ScalarTransform3f BitmapTexture
  object.bsdf.main_specular_sampling_rate                   float Principled
  object.bsdf.clearcoat_sampling_rate                       float Principled
  object.bsdf.diffuse_reflectance_sampling_rate             float Principled
  object.bsdf.specular                             ∂, D     Float Principled
  object.bsdf.roughness.data                       ∂, D     TensorXf BitmapTexture
  object.bsdf.roughness.to_uv                      , D      ScalarTransform3f BitmapTexture
  object.bsdf.base_color.data                      ∂        TensorXf BitmapTexture
  object.bsdf.base_color.to_uv                              ScalarTransform3f BitmapTexture
  object.bsdf.anisotropic.value                    ∂        Float UniformSpectrum
  object.bsdf.spec_tint.value                      ∂        Float UniformSpectrum
  object.bsdf.sheen.value                          ∂        Float UniformSpectrum
  object.bsdf.sheen_tint.value                     ∂        Float UniformSpectrum
  object.bsdf.spec_trans.value                     ∂        Float UniformSpectrum
  object.bsdf.flatness.value                       ∂        Float UniformSpectrum
  object.to_world                                  ∂, D     Transform4f Sphere
  sensor.near_clip                                          float PerspectiveCamera
  sensor.far_clip                                           float PerspectiveCamera
  sensor.shutter_open                                       float PerspectiveCamera
  sensor.shutter_open_time                                  float PerspectiveCamera
  sensor.film.size                                          ScalarVector2u HDRFilm
  sensor.film.crop_size                                     ScalarVector2u HDRFilm
  sensor.film.crop_offset                                   ScalarPoint2u HDRFilm
  sensor.x_fov                                     ∂, D     Float PerspectiveCamera
  sensor.to_world                                  ∂, D     Transform4f PerspectiveCamera
]
loading reference images
number of images in dataset: 100
elapsed time: 1.1540365000000001
done loading reference images
key='object.bsdf.base_color.data'
key='object.bsdf.roughness.data'
key='object.bsdf.metallic.data'
render_resolution=array([256, 256])
Stage 01
Iter 0
elapsed time sample 0: 0.044426500000000146
elapsed time sample 40: 0.03169939999999993
elapsed time sample 80: 0.03342480000000059

Critical Dr.Jit compiler failure: jit_llvm_compile(): parsing failed. Please see the LLVM IR and error message below:

define void @drjit_32d2445094dccf0aa609267382b86763(i64 %start, i64 %end, ptr noalias %params) #0 {
entry:
    %callables = load ptr, ptr @callables, align 8
    %buffer = alloca i8, i32 736, align 32
    br label %body

body:
    %index = phi i64 [ %index_next, %suffix ], [ %start, %entry ]
    %f1_1 = insertelement <8 x float> undef, float 0xc012bffd00000000, i32 0
    %f1 = shufflevector <8 x float> %f1_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f2_1 = insertelement <8 x float> undef, float 0x403e9dbd60000000, i32 0
    %f2 = shufflevector <8 x float> %f2_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f3_1 = insertelement <8 x float> undef, float 0xc05657e540000000, i32 0
    %f3 = shufflevector <8 x float> %f3_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f4_1 = insertelement <8 x float> undef, float 0x40634c5d40000000, i32 0
    %f4 = shufflevector <8 x float> %f4_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f5_1 = insertelement <8 x float> undef, float 0xc06604b8a0000000, i32 0
    %f5 = shufflevector <8 x float> %f5_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f6_1 = insertelement <8 x float> undef, float 0x4061810320000000, i32 0
    %f6 = shufflevector <8 x float> %f6_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f7_1 = insertelement <8 x float> undef, float 0xc053c9afe0000000, i32 0
    %f7 = shufflevector <8 x float> %f7_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f8_1 = insertelement <8 x float> undef, float 0x403f4a20c0000000, i32 0
    %f8 = shufflevector <8 x float> %f8_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f9_1 = insertelement <8 x float> undef, float 0xc01fda5bc0000000, i32 0
    %f9 = shufflevector <8 x float> %f9_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f10_1 = insertelement <8 x float> undef, float 0x3feff9d520000000, i32 0
    %f10 = shufflevector <8 x float> %f10_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f11_p1 = getelementptr inbounds ptr, ptr %params, i32 3
    %f11_p3 = load ptr, ptr %f11_p1, align 8, !alias.scope !2
    %f11_0 = load float, ptr %f11_p3, align 4, !alias.scope !2
    %f11_1 = insertelement <8 x float> undef, float %f11_0, i32 0
    %f11 = shufflevector <8 x float> %f11_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f12_p1 = getelementptr inbounds ptr, ptr %params, i32 4
    %f12_p3 = load ptr, ptr %f12_p1, align 8, !alias.scope !2
    %f12_0 = load float, ptr %f12_p3, align 4, !alias.scope !2
    %f12_1 = insertelement <8 x float> undef, float %f12_0, i32 0
    %f12 = shufflevector <8 x float> %f12_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f13_1 = insertelement <8 x float> undef, float 0x3cf6564860000000, i32 0
    %f13 = shufflevector <8 x float> %f13_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f14_1 = insertelement <8 x float> undef, float 0xbca1a62640000000, i32 0
    %f14 = shufflevector <8 x float> %f14_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f15_p1 = getelementptr inbounds ptr, ptr %params, i32 5
    %f15_p3 = load ptr, ptr %f15_p1, align 8, !alias.scope !2
    %f15_0 = load float, ptr %f15_p3, align 4, !alias.scope !2
    %f15_1 = insertelement <8 x float> undef, float %f15_0, i32 0
    %f15 = shufflevector <8 x float> %f15_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f16_p1 = getelementptr inbounds ptr, ptr %params, i32 6
    %f16_p3 = load ptr, ptr %f16_p1, align 8, !alias.scope !2
    %f16_0 = load float, ptr %f16_p3, align 4, !alias.scope !2
    %f16_1 = insertelement <8 x float> undef, float %f16_0, i32 0
    %f16 = shufflevector <8 x float> %f16_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f17_p1 = getelementptr inbounds ptr, ptr %params, i32 7
    %f17_p3 = load ptr, ptr %f17_p1, align 8, !alias.scope !2
    %f17_0 = load float, ptr %f17_p3, align 4, !alias.scope !2
    %f17_1 = insertelement <8 x float> undef, float %f17_0, i32 0
    %f17 = shufflevector <8 x float> %f17_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f18_p1 = getelementptr inbounds ptr, ptr %params, i32 8
    %f18_p3 = load ptr, ptr %f18_p1, align 8, !alias.scope !2
    %f18_0 = load float, ptr %f18_p3, align 4, !alias.scope !2
    %f18_1 = insertelement <8 x float> undef, float %f18_0, i32 0
    %f18 = shufflevector <8 x float> %f18_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f19_p1 = getelementptr inbounds ptr, ptr %params, i32 9
    %f19_p3 = load ptr, ptr %f19_p1, align 8, !alias.scope !2
    %f19_0 = load float, ptr %f19_p3, align 4, !alias.scope !2
    %f19_1 = insertelement <8 x float> undef, float %f19_0, i32 0
    %f19 = shufflevector <8 x float> %f19_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f20_p1 = getelementptr inbounds ptr, ptr %params, i32 10
    %f20_p3 = load ptr, ptr %f20_p1, align 8, !alias.scope !2
    %f20_0 = load float, ptr %f20_p3, align 4, !alias.scope !2
    %f20_1 = insertelement <8 x float> undef, float %f20_0, i32 0
    %f20 = shufflevector <8 x float> %f20_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f21_p1 = getelementptr inbounds ptr, ptr %params, i32 11
    %f21_p3 = load ptr, ptr %f21_p1, align 8, !alias.scope !2
    %f21_0 = load float, ptr %f21_p3, align 4, !alias.scope !2
    %f21_1 = insertelement <8 x float> undef, float %f21_0, i32 0
    %f21 = shufflevector <8 x float> %f21_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f22_p1 = getelementptr inbounds ptr, ptr %params, i32 12
    %f22_p3 = load ptr, ptr %f22_p1, align 8, !alias.scope !2
    %f22_0 = load float, ptr %f22_p3, align 4, !alias.scope !2
    %f22_1 = insertelement <8 x float> undef, float %f22_0, i32 0
    %f22 = shufflevector <8 x float> %f22_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f23_p1 = getelementptr inbounds ptr, ptr %params, i32 13
    %f23_p3 = load ptr, ptr %f23_p1, align 8, !alias.scope !2
    %f23_0 = load float, ptr %f23_p3, align 4, !alias.scope !2
    %f23_1 = insertelement <8 x float> undef, float %f23_0, i32 0
    %f23 = shufflevector <8 x float> %f23_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f24_p1 = getelementptr inbounds ptr, ptr %params, i32 14
    %f24_p3 = load ptr, ptr %f24_p1, align 8, !alias.scope !2
    %f24_0 = load float, ptr %f24_p3, align 4, !alias.scope !2
    %f24_1 = insertelement <8 x float> undef, float %f24_0, i32 0
    %f24 = shufflevector <8 x float> %f24_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f25_p1 = getelementptr inbounds ptr, ptr %params, i32 15
    %f25_p3 = load ptr, ptr %f25_p1, align 8, !alias.scope !2
    %f25_0 = load float, ptr %f25_p3, align 4, !alias.scope !2
    %f25_1 = insertelement <8 x float> undef, float %f25_0, i32 0
    %f25 = shufflevector <8 x float> %f25_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f26_p1 = getelementptr inbounds ptr, ptr %params, i32 16
    %f26_p3 = load ptr, ptr %f26_p1, align 8, !alias.scope !2
    %f26_0 = load float, ptr %f26_p3, align 4, !alias.scope !2
    %f26_1 = insertelement <8 x float> undef, float %f26_0, i32 0
    %f26 = shufflevector <8 x float> %f26_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f27_p1 = getelementptr inbounds ptr, ptr %params, i32 17
    %f27_p3 = load ptr, ptr %f27_p1, align 8, !alias.scope !2
    %f27_0 = load float, ptr %f27_p3, align 4, !alias.scope !2
    %f27_1 = insertelement <8 x float> undef, float %f27_0, i32 0
    %f27 = shufflevector <8 x float> %f27_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f28_p1 = getelementptr inbounds ptr, ptr %params, i32 18
    %f28_p3 = load ptr, ptr %f28_p1, align 8, !alias.scope !2
    %f28_0 = load float, ptr %f28_p3, align 4, !alias.scope !2
    %f28_1 = insertelement <8 x float> undef, float %f28_0, i32 0
    %f28 = shufflevector <8 x float> %f28_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f29_p1 = getelementptr inbounds ptr, ptr %params, i32 19
    %f29_p3 = load ptr, ptr %f29_p1, align 8, !alias.scope !2
    %f29_0 = load float, ptr %f29_p3, align 4, !alias.scope !2
    %f29_1 = insertelement <8 x float> undef, float %f29_0, i32 0
    %f29 = shufflevector <8 x float> %f29_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f30_p1 = getelementptr inbounds ptr, ptr %params, i32 20
    %f30_p3 = load ptr, ptr %f30_p1, align 8, !alias.scope !2
    %f30_0 = load float, ptr %f30_p3, align 4, !alias.scope !2
    %f30_1 = insertelement <8 x float> undef, float %f30_0, i32 0
    %f30 = shufflevector <8 x float> %f30_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f31_1 = insertelement <8 x float> undef, float 0x0, i32 0
    %f31 = shufflevector <8 x float> %f31_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f32_1 = insertelement <8 x float> undef, float 0xbff0000000000000, i32 0
    %f32 = shufflevector <8 x float> %f32_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f33_1 = insertelement <8 x float> undef, float 0xc044400000000000, i32 0
    %f33 = shufflevector <8 x float> %f33_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f34_1 = insertelement <8 x float> undef, float 0x3ca1a62640000000, i32 0
    %f34 = shufflevector <8 x float> %f34_1, <8 x float> undef, <8 x i32> zeroinitializer
    %rd35_p1 = getelementptr inbounds ptr, ptr %params, i32 21
    %rd35 = load ptr, ptr %rd35_p1, align 8, !alias.scope !2
    %rd36_p1 = getelementptr inbounds ptr, ptr %params, i32 22
    %rd36 = load ptr, ptr %rd36_p1, align 8, !alias.scope !2
    %p37_1 = insertelement <8 x i1> undef, i1 1, i32 0
    %p37 = shufflevector <8 x i1> %p37_1, <8 x i1> undef, <8 x i32> zeroinitializer
    %r38_0 = trunc i64 %index to i32
    %r38_1 = insertelement <8 x i32> undef, i32 %r38_0, i32 0
    %r38_2 = shufflevector <8 x i32> %r38_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r38 = add <8 x i32> %r38_2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %p39_0 = trunc i64 %end to i32
    %p39_1 = insertelement <8 x i32> undef, i32 %p39_0, i32 0
    %p39_2 = shufflevector <8 x i32> %p39_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p39 = icmp ult <8 x i32> %r38, %p39_2
    %r40_1 = insertelement <8 x i32> undef, i32 4294967295, i32 0
    %r40 = shufflevector <8 x i32> %r40_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r41_1 = insertelement <8 x i32> undef, i32 0, i32 0
    %r41 = shufflevector <8 x i32> %r41_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r42 = select <8 x i1> %p39, <8 x i32> %r40, <8 x i32> %r41
    %f43_1 = insertelement <8 x float> undef, float 0x0, i32 0
    %f43 = shufflevector <8 x float> %f43_1, <8 x float> undef, <8 x i32> zeroinitializer
    %r44_p1 = getelementptr inbounds ptr, ptr %params, i32 23
    %r44_p3 = load ptr, ptr %r44_p1, align 8, !alias.scope !2
    %r44_0 = load i32, ptr %r44_p3, align 4, !alias.scope !2
    %r44_1 = insertelement <8 x i32> undef, i32 %r44_0, i32 0
    %r44 = shufflevector <8 x i32> %r44_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r45 = lshr <8 x i32> %r38, %r44
    %r46_1 = insertelement <8 x i32> undef, i32 8, i32 0
    %r46 = shufflevector <8 x i32> %r46_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r47 = lshr <8 x i32> %r45, %r46
    %f48 = uitofp <8 x i32> %r47 to <8 x float>
    %rd49_1 = insertelement <8 x i64> undef, i64 0, i32 0
    %rd49 = shufflevector <8 x i64> %rd49_1, <8 x i64> undef, <8 x i32> zeroinitializer
    %rd50_1 = insertelement <8 x i64> undef, i64 6364136223846793005, i32 0
    %rd50 = shufflevector <8 x i64> %rd50_1, <8 x i64> undef, <8 x i32> zeroinitializer
    %r51_p1 = getelementptr inbounds ptr, ptr %params, i32 24
    %r51_p3 = load ptr, ptr %r51_p1, align 8, !alias.scope !2
    %r51_0 = load i32, ptr %r51_p3, align 4, !alias.scope !2
    %r51_1 = insertelement <8 x i32> undef, i32 %r51_0, i32 0
    %r51 = shufflevector <8 x i32> %r51_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r52_1 = insertelement <8 x i32> undef, i32 4, i32 0
    %r52 = shufflevector <8 x i32> %r52_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r53 = shl <8 x i32> %r38, %r52
    %r54_1 = insertelement <8 x i32> undef, i32 2738958700, i32 0
    %r54 = shufflevector <8 x i32> %r54_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r55 = add <8 x i32> %r53, %r54
    %r56_1 = insertelement <8 x i32> undef, i32 2654435769, i32 0
    %r56 = shufflevector <8 x i32> %r56_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r57 = add <8 x i32> %r38, %r56
    %r58 = xor <8 x i32> %r55, %r57
    %r59_1 = insertelement <8 x i32> undef, i32 5, i32 0
    %r59 = shufflevector <8 x i32> %r59_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r60 = lshr <8 x i32> %r38, %r59
    %r61_1 = insertelement <8 x i32> undef, i32 3355524772, i32 0
    %r61 = shufflevector <8 x i32> %r61_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r62 = add <8 x i32> %r60, %r61
    %r63 = xor <8 x i32> %r58, %r62
    %r64 = add <8 x i32> %r51, %r63
    %r65 = shl <8 x i32> %r64, %r52
    %r66_1 = insertelement <8 x i32> undef, i32 2911926141, i32 0
    %r66 = shufflevector <8 x i32> %r66_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r67 = add <8 x i32> %r65, %r66
    %r68 = add <8 x i32> %r64, %r56
    %r69 = xor <8 x i32> %r67, %r68
    %r70 = lshr <8 x i32> %r64, %r59
    %r71_1 = insertelement <8 x i32> undef, i32 2123724318, i32 0
    %r71 = shufflevector <8 x i32> %r71_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r72 = add <8 x i32> %r70, %r71
    %r73 = xor <8 x i32> %r69, %r72
    %r74 = add <8 x i32> %r38, %r73
    %r75 = shl <8 x i32> %r74, %r52
    %r76 = add <8 x i32> %r75, %r54
    %r77_1 = insertelement <8 x i32> undef, i32 1013904242, i32 0
    %r77 = shufflevector <8 x i32> %r77_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r78 = add <8 x i32> %r74, %r77
    %r79 = xor <8 x i32> %r76, %r78
    %r80 = lshr <8 x i32> %r74, %r59
    %r81 = add <8 x i32> %r80, %r61
    %r82 = xor <8 x i32> %r79, %r81
    %r83 = add <8 x i32> %r64, %r82
    %r84 = shl <8 x i32> %r83, %r52
    %r85 = add <8 x i32> %r84, %r66
    %r86 = add <8 x i32> %r83, %r77
    %r87 = xor <8 x i32> %r85, %r86
    %r88 = lshr <8 x i32> %r83, %r59
    %r89 = add <8 x i32> %r88, %r71
    %r90 = xor <8 x i32> %r87, %r89
    %r91 = add <8 x i32> %r74, %r90
    %r92 = shl <8 x i32> %r91, %r52
    %r93 = add <8 x i32> %r92, %r54
    %r94_1 = insertelement <8 x i32> undef, i32 3668340011, i32 0
    %r94 = shufflevector <8 x i32> %r94_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r95 = add <8 x i32> %r91, %r94
    %r96 = xor <8 x i32> %r93, %r95
    %r97 = lshr <8 x i32> %r91, %r59
    %r98 = add <8 x i32> %r97, %r61
    %r99 = xor <8 x i32> %r96, %r98
    %r100 = add <8 x i32> %r83, %r99
    %r101 = shl <8 x i32> %r100, %r52
    %r102 = add <8 x i32> %r101, %r66
    %r103 = add <8 x i32> %r100, %r94
    %r104 = xor <8 x i32> %r102, %r103
    %r105 = lshr <8 x i32> %r100, %r59
    %r106 = add <8 x i32> %r105, %r71
    %r107 = xor <8 x i32> %r104, %r106
    %r108 = add <8 x i32> %r91, %r107
    %r109 = shl <8 x i32> %r108, %r52
    %r110 = add <8 x i32> %r109, %r54
    %r111_1 = insertelement <8 x i32> undef, i32 2027808484, i32 0
    %r111 = shufflevector <8 x i32> %r111_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r112 = add <8 x i32> %r108, %r111
    %r113 = xor <8 x i32> %r110, %r112
    %r114 = lshr <8 x i32> %r108, %r59
    %r115 = add <8 x i32> %r114, %r61
    %r116 = xor <8 x i32> %r113, %r115
    %r117 = add <8 x i32> %r100, %r116
    %r118 = shl <8 x i32> %r117, %r52
    %r119 = add <8 x i32> %r118, %r66
    %r120 = add <8 x i32> %r117, %r111
    %r121 = xor <8 x i32> %r119, %r120
    %r122 = lshr <8 x i32> %r117, %r59
    %r123 = add <8 x i32> %r122, %r71
    %r124 = xor <8 x i32> %r121, %r123
    %r125 = add <8 x i32> %r108, %r124
    %rd126 = zext <8 x i32> %r125 to <8 x i64>
    %rd127_1 = insertelement <8 x i64> undef, i64 1, i32 0
    %rd127 = shufflevector <8 x i64> %rd127_1, <8 x i64> undef, <8 x i32> zeroinitializer
    %rd128 = shl <8 x i64> %rd126, %rd127
    %rd129 = or <8 x i64> %rd128, %rd127
    %rd130_0 = mul <8 x i64> %rd49, %rd50
    %rd130 = add <8 x i64> %rd130_0, %rd129
    %rd131 = zext <8 x i32> %r117 to <8 x i64>
    %rd132 = add <8 x i64> %rd130, %rd131
    %rd133_0 = mul <8 x i64> %rd132, %rd50
    %rd133 = add <8 x i64> %rd133_0, %rd129
    %rd134_0 = mul <8 x i64> %rd133, %rd50
    %rd134 = add <8 x i64> %rd134_0, %rd129
    %rd135_1 = insertelement <8 x i64> undef, i64 18, i32 0
    %rd135 = shufflevector <8 x i64> %rd135_1, <8 x i64> undef, <8 x i32> zeroinitializer
    %rd136 = lshr <8 x i64> %rd134, %rd135
    %rd137 = xor <8 x i64> %rd136, %rd134
    %rd138_1 = insertelement <8 x i64> undef, i64 27, i32 0
    %rd138 = shufflevector <8 x i64> %rd138_1, <8 x i64> undef, <8 x i32> zeroinitializer
    %rd139 = lshr <8 x i64> %rd137, %rd138
    %r140 = trunc <8 x i64> %rd139 to <8 x i32>
    %rd141_1 = insertelement <8 x i64> undef, i64 59, i32 0
    %rd141 = shufflevector <8 x i64> %rd141_1, <8 x i64> undef, <8 x i32> zeroinitializer
    %rd142 = lshr <8 x i64> %rd134, %rd141
    %r143 = trunc <8 x i64> %rd142 to <8 x i32>
    %r144 = lshr <8 x i32> %r140, %r143
    %r145 = bitcast <8 x i32> %r143 to <8 x i32>
    %r146 = sub <8 x i32> zeroinitializer, %r145
    %r147_1 = insertelement <8 x i32> undef, i32 31, i32 0
    %r147 = shufflevector <8 x i32> %r147_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r148 = and <8 x i32> %r146, %r147
    %r149 = bitcast <8 x i32> %r148 to <8 x i32>
    %r150 = shl <8 x i32> %r140, %r149
    %r151 = or <8 x i32> %r144, %r150
    %r152_1 = insertelement <8 x i32> undef, i32 9, i32 0
    %r152 = shufflevector <8 x i32> %r152_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r153 = lshr <8 x i32> %r151, %r152
    %r154_1 = insertelement <8 x i32> undef, i32 1065353216, i32 0
    %r154 = shufflevector <8 x i32> %r154_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r155 = or <8 x i32> %r153, %r154
    %f156 = bitcast <8 x i32> %r155 to <8 x float>
    %f157_1 = insertelement <8 x float> undef, float 0x3ff0000000000000, i32 0
    %f157 = shufflevector <8 x float> %f157_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f158 = fsub <8 x float> %f156, %f157
    %f159 = fadd <8 x float> %f48, %f158
    %f160_1 = insertelement <8 x float> undef, float 0x3f70000000000000, i32 0
    %f160 = shufflevector <8 x float> %f160_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f161_1 = insertelement <8 x float> undef, float 0x8000000000000000, i32 0
    %f161 = shufflevector <8 x float> %f161_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f162 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f159, <8 x float> %f160, <8 x float> %f161)
    %f163_1 = insertelement <8 x float> undef, float 0x4070000000000000, i32 0
    %f163 = shufflevector <8 x float> %f163_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f164 = fmul <8 x float> %f163, %f11
    %f165 = fmul <8 x float> %f164, %f160
    %f166 = fadd <8 x float> %f162, %f165
    %r167_1 = insertelement <8 x i32> undef, i32 256, i32 0
    %r167 = shufflevector <8 x i32> %r167_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r168 = sub <8 x i32> zeroinitializer, %r47
    %r169_0 = mul <8 x i32> %r167, %r168
    %r169 = add <8 x i32> %r169_0, %r45
    %f170 = uitofp <8 x i32> %r169 to <8 x float>
    %rd171 = lshr <8 x i64> %rd133, %rd135
    %rd172 = xor <8 x i64> %rd171, %rd133
    %rd173 = lshr <8 x i64> %rd172, %rd138
    %r174 = trunc <8 x i64> %rd173 to <8 x i32>
    %rd175 = lshr <8 x i64> %rd133, %rd141
    %r176 = trunc <8 x i64> %rd175 to <8 x i32>
    %r177 = lshr <8 x i32> %r174, %r176
    %r178 = bitcast <8 x i32> %r176 to <8 x i32>
    %r179 = sub <8 x i32> zeroinitializer, %r178
    %r180 = and <8 x i32> %r179, %r147
    %r181 = bitcast <8 x i32> %r180 to <8 x i32>
    %r182 = shl <8 x i32> %r174, %r181
    %r183 = or <8 x i32> %r177, %r182
    %r184 = lshr <8 x i32> %r183, %r152
    %r185 = or <8 x i32> %r184, %r154
    %f186 = bitcast <8 x i32> %r185 to <8 x float>
    %f187 = fsub <8 x float> %f186, %f157
    %f188 = fadd <8 x float> %f170, %f187
    %f189 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f188, <8 x float> %f160, <8 x float> %f161)
    %f190 = fmul <8 x float> %f163, %f12
    %f191 = fmul <8 x float> %f190, %f160
    %f192 = fadd <8 x float> %f189, %f191
    %f193 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f17, <8 x float> %f192, <8 x float> %f18)
    %f194 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f16, <8 x float> %f166, <8 x float> %f193)
    %f195 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f15, <8 x float> %f43, <8 x float> %f194)
    %f196 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f21, <8 x float> %f192, <8 x float> %f22)
    %f197 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f20, <8 x float> %f166, <8 x float> %f196)
    %f198 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f19, <8 x float> %f43, <8 x float> %f197)
    %f199 = fdiv <8 x float> %f157, %f198
    %f200 = fmul <8 x float> %f195, %f199
    %f201 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f25, <8 x float> %f192, <8 x float> %f26)
    %f202 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f24, <8 x float> %f166, <8 x float> %f201)
    %f203 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f23, <8 x float> %f43, <8 x float> %f202)
    %f204 = fmul <8 x float> %f203, %f199
    %f205 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f29, <8 x float> %f192, <8 x float> %f30)
    %f206 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f28, <8 x float> %f166, <8 x float> %f205)
    %f207 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f27, <8 x float> %f43, <8 x float> %f206)
    %f208 = fmul <8 x float> %f207, %f199
    %f209 = fmul <8 x float> %f208, %f208
    %f210 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f204, <8 x float> %f204, <8 x float> %f209)
    %f211 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f200, <8 x float> %f200, <8 x float> %f210)
    %f212 = fdiv <8 x float> %f157, %f211
    %f213 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f212)
    %f214 = fmul <8 x float> %f200, %f213
    %f215 = fmul <8 x float> %f204, %f213
    %f216 = fmul <8 x float> %f208, %f213
    %f217 = fmul <8 x float> %f31, %f216
    %f218 = fadd <8 x float> %f215, %f217
    %f219 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f14, <8 x float> %f214, <8 x float> %f218)
    %f220_1 = insertelement <8 x float> undef, float 0x3f847ae140000000, i32 0
    %f220 = shufflevector <8 x float> %f220_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f221 = fdiv <8 x float> %f157, %f214
    %f222 = fmul <8 x float> %f220, %f221
    %f223 = fmul <8 x float> %f219, %f222
    %f224 = fadd <8 x float> %f13, %f223
    %f225 = fmul <8 x float> %f32, %f216
    %f226 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f31, <8 x float> %f215, <8 x float> %f225)
    %f227 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f31, <8 x float> %f214, <8 x float> %f226)
    %f228 = fmul <8 x float> %f227, %f222
    %f229 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f34, <8 x float> %f215, <8 x float> %f217)
    %f230 = fadd <8 x float> %f214, %f229
    %f231 = fmul <8 x float> %f230, %f222
    %f232 = fadd <8 x float> %f33, %f231
    %f233_1 = insertelement <8 x float> undef, float 0x40c3880000000000, i32 0
    %f233 = shufflevector <8 x float> %f233_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f234 = fmul <8 x float> %f233, %f221
    %f235 = fsub <8 x float> %f234, %f222
    %r236_1 = insertelement <8 x i32> undef, i32 0, i32 0
    %r236 = shufflevector <8 x i32> %r236_1, <8 x i32> undef, <8 x i32> zeroinitializer

    ; -------- Ray trace -------
    %u237_in_0_1 = getelementptr inbounds i8, ptr %buffer, i32 0
    store <8 x i32> %r42, ptr %u237_in_0_1, align 32
    %u237_in_1_1 = getelementptr inbounds i8, ptr %buffer, i32 32
    store <8 x float> %f224, ptr %u237_in_1_1, align 32
    %u237_in_2_1 = getelementptr inbounds i8, ptr %buffer, i32 64
    store <8 x float> %f228, ptr %u237_in_2_1, align 32
    %u237_in_3_1 = getelementptr inbounds i8, ptr %buffer, i32 96
    store <8 x float> %f232, ptr %u237_in_3_1, align 32
    %u237_in_4_1 = getelementptr inbounds i8, ptr %buffer, i32 128
    store <8 x float> %f43, ptr %u237_in_4_1, align 32
    %u237_in_5_1 = getelementptr inbounds i8, ptr %buffer, i32 160
    store <8 x float> %f219, ptr %u237_in_5_1, align 32
    %u237_in_6_1 = getelementptr inbounds i8, ptr %buffer, i32 192
    store <8 x float> %f227, ptr %u237_in_6_1, align 32
    %u237_in_7_1 = getelementptr inbounds i8, ptr %buffer, i32 224
    store <8 x float> %f230, ptr %u237_in_7_1, align 32
    %u237_in_8_1 = getelementptr inbounds i8, ptr %buffer, i32 256
    store <8 x float> %f43, ptr %u237_in_8_1, align 32
    %u237_in_9_1 = getelementptr inbounds i8, ptr %buffer, i32 288
    store <8 x float> %f235, ptr %u237_in_9_1, align 32
    %u237_in_10_1 = getelementptr inbounds i8, ptr %buffer, i32 320
    store <8 x i32> %r236, ptr %u237_in_10_1, align 32
    %u237_in_11_1 = getelementptr inbounds i8, ptr %buffer, i32 352
    store <8 x i32> %r236, ptr %u237_in_11_1, align 32
    %u237_in_12_1 = getelementptr inbounds i8, ptr %buffer, i32 384
    store <8 x i32> %r236, ptr %u237_in_12_1, align 32
    %u237_in_geomid_1 = getelementptr inbounds i8, ptr %buffer, i32 608
    store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, ptr %u237_in_geomid_1, align 32
    %u237_in_ctx_1 = getelementptr inbounds i8, ptr %buffer, i32 672
    store <6 x i32> , ptr %u237_in_ctx_1, align 4
    call void %rd35(ptr %u237_in_0_1, ptr %rd36, ptr %u237_in_ctx_1, ptr %u237_in_1_1)
    %u237_out_0_1 = getelementptr inbounds i8, ptr %buffer, i32 288
    %u237_out_0 = load <8 x float>, ptr %u237_out_0_1, align 32
    %u237_out_1_1 = getelementptr inbounds i8, ptr %buffer, i32 512
    %u237_out_1 = load <8 x float>, ptr %u237_out_1_1, align 32
    %u237_out_2_1 = getelementptr inbounds i8, ptr %buffer, i32 544
    %u237_out_2 = load <8 x float>, ptr %u237_out_2_1, align 32
    %u237_out_3_1 = getelementptr inbounds i8, ptr %buffer, i32 576
    %u237_out_3 = load <8 x i32>, ptr %u237_out_3_1, align 32
    %u237_out_4_1 = getelementptr inbounds i8, ptr %buffer, i32 608
    %u237_out_4 = load <8 x i32>, ptr %u237_out_4_1, align 32
    %u237_out_5_1 = getelementptr inbounds i8, ptr %buffer, i32 640
    %u237_out_5 = load <8 x i32>, ptr %u237_out_5_1, align 32
    ; -------------------

    %f238 = bitcast <8 x float> %u237_out_0 to <8 x float>
    %p239 = fcmp one <8 x float> %f238, %f235
    %f240_1 = insertelement <8 x float> undef, float 0x7ff0000000000000, i32 0
    %f240 = shufflevector <8 x float> %f240_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f241 = select <8 x i1> %p239, <8 x float> %f238, <8 x float> %f240
    %p242 = fcmp one <8 x float> %f241, %f240
    %r243 = bitcast <8 x i32> %u237_out_5 to <8 x i32>
    %r244_1 = insertelement <8 x i32> undef, i32 4294967295, i32 0
    %r244 = shufflevector <8 x i32> %r244_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p245 = icmp ne <8 x i32> %r243, %r244
    %p246 = and <8 x i1> %p239, %p245
    %rd247_p1 = getelementptr inbounds ptr, ptr %params, i32 25
    %rd247 = load ptr, ptr %rd247_p1, align 8, !alias.scope !2
    %r248 = bitcast <8 x i32> %u237_out_4 to <8 x i32>
    %r249 = select <8 x i1> %p246, <8 x i32> %r243, <8 x i32> %r248
    %p250 = and <8 x i1> %p39, %p239
    %r251_1 = getelementptr i32, ptr %rd247, <8 x i32> %r249
    %r251 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %r251_1, i32 4, <8 x i1> %p250, <8 x i32> zeroinitializer)
    %r252 = select <8 x i1> %p246, <8 x i32> %r251, <8 x i32> %r236
    %p253 = icmp eq <8 x i32> %r252, %r236
    %p254 = xor <8 x i1> %p246, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
    %r255 = select <8 x i1> %p254, <8 x i32> %r251, <8 x i32> %r236
    %r256 = select <8 x i1> %p253, <8 x i32> %r255, <8 x i32> %r252
    %rd257_0 = mul <8 x i64> %rd134, %rd50
    %rd257 = add <8 x i64> %rd257_0, %rd129
    %r258_0 = trunc i64 %index to i32
    %r258_1 = insertelement <8 x i32> undef, i32 %r258_0, i32 0
    %r258_2 = shufflevector <8 x i32> %r258_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r258 = add <8 x i32> %r258_2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %p259_0 = trunc i64 %end to i32
    %p259_1 = insertelement <8 x i32> undef, i32 %p259_0, i32 0
    %p259_2 = shufflevector <8 x i32> %p259_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p259 = icmp ult <8 x i32> %r258, %p259_2
    %r260_1 = insertelement <8 x i32> undef, i32 0, i32 0
    %r260 = shufflevector <8 x i32> %r260_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p261 = icmp ne <8 x i32> %r256, %r260
    %p262 = and <8 x i1> %p242, %p261
    %p263 = and <8 x i1> %p259, %p262
    %rd264_p1 = getelementptr inbounds ptr, ptr %params, i32 26
    %rd264 = load ptr, ptr %rd264_p1, align 8, !alias.scope !2
    %rd265_p1 = getelementptr inbounds ptr, ptr %params, i32 27
    %rd265 = load ptr, ptr %rd265_p1, align 8, !alias.scope !2

    br label %l266_start

l266_start:
    ; VCall: mitsuba::Shape::compute_surface_interaction()
    %u266_self_ptr = getelementptr i64, ptr %rd264, <8 x i32> %r256
    %u266_self_combined = call <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr> %u266_self_ptr, i32 8, <8 x i1> %p263, <8 x i64> zeroinitializer)
    %u266_self_initial = trunc <8 x i64> %u266_self_combined to <8 x i32>
    %u266_offset_1 = lshr <8 x i64> %u266_self_combined, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %u266_offset = trunc <8 x i64> %u266_offset_1 to <8 x i32>
    %u266_in_0_1 = getelementptr inbounds i8, ptr %buffer, i32 0
    store <8 x float> %f224, ptr %u266_in_0_1, align 32
    %u266_in_1_1 = getelementptr inbounds i8, ptr %buffer, i32 32
    store <8 x float> %f228, ptr %u266_in_1_1, align 32
    %u266_in_2_1 = getelementptr inbounds i8, ptr %buffer, i32 64
    store <8 x float> %f232, ptr %u266_in_2_1, align 32
    %u266_in_3_1 = getelementptr inbounds i8, ptr %buffer, i32 96
    store <8 x float> %f219, ptr %u266_in_3_1, align 32
    %u266_in_4_1 = getelementptr inbounds i8, ptr %buffer, i32 128
    store <8 x float> %f227, ptr %u266_in_4_1, align 32
    %u266_in_5_1 = getelementptr inbounds i8, ptr %buffer, i32 160
    store <8 x float> %f230, ptr %u266_in_5_1, align 32
    %u266_in_6_1 = getelementptr inbounds i8, ptr %buffer, i32 192
    store <8 x float> %f241, ptr %u266_in_6_1, align 32
    %u266_out = getelementptr i8, ptr %buffer, i32 224
    %u266_tmp_0_1 = getelementptr inbounds i8, ptr %u266_out, i64 0
    store <8 x float> zeroinitializer, ptr %u266_tmp_0_1, align 32
    %u266_tmp_1_1 = getelementptr inbounds i8, ptr %u266_out, i64 32
    store <8 x float> zeroinitializer, ptr %u266_tmp_1_1, align 32
    %u266_tmp_2_1 = getelementptr inbounds i8, ptr %u266_out, i64 64
    store <8 x float> zeroinitializer, ptr %u266_tmp_2_1, align 32
    %u266_tmp_3_1 = getelementptr inbounds i8, ptr %u266_out, i64 96
    store <8 x float> zeroinitializer, ptr %u266_tmp_3_1, align 32
    %u266_tmp_4_1 = getelementptr inbounds i8, ptr %u266_out, i64 128
    store <8 x float> zeroinitializer, ptr %u266_tmp_4_1, align 32
    %u266_tmp_5_1 = getelementptr inbounds i8, ptr %u266_out, i64 160
    store <8 x float> zeroinitializer, ptr %u266_tmp_5_1, align 32
    %u266_tmp_6_1 = getelementptr inbounds i8, ptr %u266_out, i64 192
    store <8 x float> zeroinitializer, ptr %u266_tmp_6_1, align 32
    %u266_tmp_7_1 = getelementptr inbounds i8, ptr %u266_out, i64 224
    store <8 x float> zeroinitializer, ptr %u266_tmp_7_1, align 32
    %u266_tmp_8_1 = getelementptr inbounds i8, ptr %u266_out, i64 256
    store <8 x i32> zeroinitializer, ptr %u266_tmp_8_1, align 32
    %u266_tmp_9_1 = getelementptr inbounds i8, ptr %u266_out, i64 288
    store <8 x float> zeroinitializer, ptr %u266_tmp_9_1, align 32
    %u266_tmp_10_1 = getelementptr inbounds i8, ptr %u266_out, i64 320
    store <8 x float> zeroinitializer, ptr %u266_tmp_10_1, align 32
    %u266_tmp_11_1 = getelementptr inbounds i8, ptr %u266_out, i64 352
    store <8 x float> zeroinitializer, ptr %u266_tmp_11_1, align 32
    %u266_tmp_12_1 = getelementptr inbounds i8, ptr %u266_out, i64 384
    store <8 x float> zeroinitializer, ptr %u266_tmp_12_1, align 32
    %u266_tmp_13_1 = getelementptr inbounds i8, ptr %u266_out, i64 416
    store <8 x float> zeroinitializer, ptr %u266_tmp_13_1, align 32
    %u266_tmp_14_1 = getelementptr inbounds i8, ptr %u266_out, i64 448
    store <8 x float> zeroinitializer, ptr %u266_tmp_14_1, align 32
    %u266_tmp_15_1 = getelementptr inbounds i8, ptr %u266_out, i64 480
    store <8 x float> zeroinitializer, ptr %u266_tmp_15_1, align 32
    br label %l266_check

l266_check:
    %u266_self = phi <8 x i32> [ %u266_self_initial, %l266_start ], [ %u266_self_next, %l266_call ]
    %u266_next = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %u266_self)
    %u266_valid = icmp ne i32 %u266_next, 0
    br i1 %u266_valid, label %l266_call, label %l266_end

l266_call:
    %u266_bcast_0 = insertelement <8 x i32> undef, i32 %u266_next, i32 0
    %u266_bcast = shufflevector <8 x i32> %u266_bcast_0, <8 x i32> undef, <8 x i32> zeroinitializer
    %u266_active = icmp eq <8 x i32> %u266_self, %u266_bcast
    %u266_func_0 = getelementptr inbounds ptr, ptr %callables, i32 %u266_next
    %u266_func = load ptr, ptr %u266_func_0
    call void %u266_func(<8 x i1> %u266_active, <8 x i32> %r256, ptr %buffer, ptr %rd265, <8 x i32> %u266_offset)
    %u266_self_next = select <8 x i1> %u266_active, <8 x i32> zeroinitializer, <8 x i32> %u266_self
    br label %l266_check

l266_end:
    %u266_out_0_1 = getelementptr inbounds i8, ptr %u266_out, i64 0
    %f267 = load <8 x float>, ptr %u266_out_0_1, align 32
    %u266_out_1_1 = getelementptr inbounds i8, ptr %u266_out, i64 32
    %f271 = load <8 x float>, ptr %u266_out_1_1, align 32
    %u266_out_2_1 = getelementptr inbounds i8, ptr %u266_out, i64 64
    %f280 = load <8 x float>, ptr %u266_out_2_1, align 32
    %u266_out_3_1 = getelementptr inbounds i8, ptr %u266_out, i64 96
    %f281 = load <8 x float>, ptr %u266_out_3_1, align 32
    %u266_out_4_1 = getelementptr inbounds i8, ptr %u266_out, i64 128
    %f282 = load <8 x float>, ptr %u266_out_4_1, align 32
    %u266_out_5_1 = getelementptr inbounds i8, ptr %u266_out, i64 160
    %f279 = load <8 x float>, ptr %u266_out_5_1, align 32
    %u266_out_6_1 = getelementptr inbounds i8, ptr %u266_out, i64 192
    %f278 = load <8 x float>, ptr %u266_out_6_1, align 32
    %u266_out_7_1 = getelementptr inbounds i8, ptr %u266_out, i64 224
    %f277 = load <8 x float>, ptr %u266_out_7_1, align 32
    %u266_out_8_1 = getelementptr inbounds i8, ptr %u266_out, i64 256
    %r268 = load <8 x i32>, ptr %u266_out_8_1, align 32
    %u266_out_9_1 = getelementptr inbounds i8, ptr %u266_out, i64 288
    %f269 = load <8 x float>, ptr %u266_out_9_1, align 32
    %u266_out_10_1 = getelementptr inbounds i8, ptr %u266_out, i64 320
    %f270 = load <8 x float>, ptr %u266_out_10_1, align 32
    %u266_out_11_1 = getelementptr inbounds i8, ptr %u266_out, i64 352
    %f272 = load <8 x float>, ptr %u266_out_11_1, align 32
    %u266_out_12_1 = getelementptr inbounds i8, ptr %u266_out, i64 384
    %f275 = load <8 x float>, ptr %u266_out_12_1, align 32
    %u266_out_13_1 = getelementptr inbounds i8, ptr %u266_out, i64 416
    %f276 = load <8 x float>, ptr %u266_out_13_1, align 32
    %u266_out_14_1 = getelementptr inbounds i8, ptr %u266_out, i64 448
    %f274 = load <8 x float>, ptr %u266_out_14_1, align 32
    %u266_out_15_1 = getelementptr inbounds i8, ptr %u266_out, i64 480
    %f273 = load <8 x float>, ptr %u266_out_15_1, align 32
    br label %l266_done

l266_done:
    %rd283_p1 = getelementptr inbounds ptr, ptr %params, i32 28
    %rd283 = load ptr, ptr %rd283_p1, align 8, !alias.scope !2
    %p284 = xor <8 x i1> %p242, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
    %f285_1 = insertelement <8 x float> undef, float 0x7ff0000000000000, i32 0
    %f285 = shufflevector <8 x float> %f285_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f286 = select <8 x i1> %p284, <8 x float> %f285, <8 x float> %f267
    %p287 = fcmp one <8 x float> %f286, %f285
    %p288 = and <8 x i1> %p242, %p287
    %p289 = xor <8 x i1> %p288, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
    %r290_1 = insertelement <8 x i32> undef, i32 0, i32 0
    %r290 = shufflevector <8 x i32> %r290_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r291 = select <8 x i1> %p289, <8 x i32> %r290, <8 x i32> %r268
    %r292_0 = trunc i64 %index to i32
    %r292_1 = insertelement <8 x i32> undef, i32 %r292_0, i32 0
    %r292_2 = shufflevector <8 x i32> %r292_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r292 = add <8 x i32> %r292_2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %p293_0 = trunc i64 %end to i32
    %p293_1 = insertelement <8 x i32> undef, i32 %p293_0, i32 0
    %p293_2 = shufflevector <8 x i32> %p293_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p293 = icmp ult <8 x i32> %r292, %p293_2
    %p294 = icmp ne <8 x i32> %r291, %r290
    %p295 = and <8 x i1> %p293, %p294
    %r296_1 = getelementptr i32, ptr %rd283, <8 x i32> %r291
    %r296 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %r296_1, i32 4, <8 x i1> %p295, <8 x i32> zeroinitializer)
    %f297 = fneg <8 x float> %f230
    %f298_1 = insertelement <8 x float> undef, float 0x0, i32 0
    %f298 = shufflevector <8 x float> %f298_1, <8 x float> undef, <8 x i32> zeroinitializer
    %p299 = fcmp oeq <8 x float> %f271, %f298
    %p300 = fcmp oeq <8 x float> %f272, %f298
    %p301 = and <8 x i1> %p299, %p300
    %p302 = fcmp oeq <8 x float> %f273, %f298
    %p303 = and <8 x i1> %p301, %p302
    %p304 = fcmp oge <8 x float> %f274, %f298
    %f305 = fneg <8 x float> %f275
    %f306 = select <8 x i1> %p304, <8 x float> %f305, <8 x float> %f275
    %f307 = fmul <8 x float> %f275, %f271
    %f308 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f276, <8 x float> %f272, <8 x float> %f307)
    %f309 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f274, <8 x float> %f273, <8 x float> %f308)
    %f310 = fneg <8 x float> %f309
    %f311 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f274, <8 x float> %f310, <8 x float> %f273)
    %f312_1 = insertelement <8 x float> undef, float 0x3ff0000000000000, i32 0
    %f312 = shufflevector <8 x float> %f312_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f313 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f276, <8 x float> %f310, <8 x float> %f272)
    %f314 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f275, <8 x float> %f310, <8 x float> %f271)
    %f315 = fmul <8 x float> %f314, %f314
    %f316 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f313, <8 x float> %f313, <8 x float> %f315)
    %f317 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f311, <8 x float> %f311, <8 x float> %f316)
    %f318 = fdiv <8 x float> %f312, %f317
    %f319 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f318)
    %f320 = fmul <8 x float> %f311, %f319
    %f321 = select <8 x i1> %p303, <8 x float> %f306, <8 x float> %f320
    %f322 = fneg <8 x float> %f227
    %f323 = fmul <8 x float> %f275, %f276
    %f324_1 = insertelement <8 x float> undef, float 0xbff0000000000000, i32 0
    %f324 = shufflevector <8 x float> %f324_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f325 = select <8 x i1> %p304, <8 x float> %f312, <8 x float> %f324
    %f326 = fadd <8 x float> %f325, %f274
    %f327 = fdiv <8 x float> %f312, %f326
    %f328 = fneg <8 x float> %f327
    %f329 = fmul <8 x float> %f323, %f328
    %f330 = fneg <8 x float> %f329
    %f331 = select <8 x i1> %p304, <8 x float> %f329, <8 x float> %f330
    %f332 = fmul <8 x float> %f313, %f319
    %f333 = select <8 x i1> %p303, <8 x float> %f331, <8 x float> %f332
    %f334 = fneg <8 x float> %f219
    %f335 = fmul <8 x float> %f275, %f275
    %f336 = fmul <8 x float> %f335, %f328
    %f337 = fneg <8 x float> %f336
    %f338 = select <8 x i1> %p304, <8 x float> %f336, <8 x float> %f337
    %f339 = fadd <8 x float> %f338, %f312
    %f340 = fmul <8 x float> %f314, %f319
    %f341 = select <8 x i1> %p303, <8 x float> %f339, <8 x float> %f340
    %f342 = fmul <8 x float> %f334, %f341
    %f343 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f322, <8 x float> %f333, <8 x float> %f342)
    %f344 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f297, <8 x float> %f321, <8 x float> %f343)
    %f345 = select <8 x i1> %p288, <8 x float> %f344, <8 x float> %f334
    %f346 = fmul <8 x float> %f276, %f341
    %f347 = fneg <8 x float> %f346
    %f348 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f275, <8 x float> %f333, <8 x float> %f347)
    %f349 = fmul <8 x float> %f275, %f321
    %f350 = fneg <8 x float> %f349
    %f351 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f274, <8 x float> %f341, <8 x float> %f350)
    %f352 = fmul <8 x float> %f274, %f333
    %f353 = fneg <8 x float> %f352
    %f354 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f276, <8 x float> %f321, <8 x float> %f353)
    %f355 = fmul <8 x float> %f334, %f354
    %f356 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f322, <8 x float> %f351, <8 x float> %f355)
    %f357 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f297, <8 x float> %f348, <8 x float> %f356)
    %f358 = select <8 x i1> %p288, <8 x float> %f357, <8 x float> %f322
    %f359 = fmul <8 x float> %f334, %f275
    %f360 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f322, <8 x float> %f276, <8 x float> %f359)
    %f361 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f297, <8 x float> %f274, <8 x float> %f360)
    %f362 = select <8 x i1> %p288, <8 x float> %f361, <8 x float> %f297
    %rd363_p1 = getelementptr inbounds ptr, ptr %params, i32 29
    %rd363 = load ptr, ptr %rd363_p1, align 8, !alias.scope !2
    %p364 = icmp ne <8 x i32> %r296, %r290
    %p365 = and <8 x i1> %p293, %p364
    %r366_1 = getelementptr i32, ptr %rd363, <8 x i32> %r296
    %r366 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %r366_1, i32 4, <8 x i1> %p365, <8 x i32> zeroinitializer)
    %r367_1 = insertelement <8 x i32> undef, i32 30, i32 0
    %r367 = shufflevector <8 x i32> %r367_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r368 = and <8 x i32> %r366, %r367
    %p369 = icmp ne <8 x i32> %r368, %r290
    %p370 = and <8 x i1> %p287, %p369
    %rd371_1 = insertelement <8 x i64> undef, i64 6364136223846793005, i32 0
    %rd371 = shufflevector <8 x i64> %rd371_1, <8 x i64> undef, <8 x i32> zeroinitializer
    %rd372_0 = mul <8 x i64> %rd257, %rd371
    %rd372 = add <8 x i64> %rd372_0, %rd129
    %rd373 = select <8 x i1> %p370, <8 x i64> %rd372, <8 x i64> %rd257
    %rd374_0 = mul <8 x i64> %rd373, %rd371
    %rd374 = add <8 x i64> %rd374_0, %rd129
    %rd375 = select <8 x i1> %p370, <8 x i64> %rd374, <8 x i64> %rd373
    %rd376_p1 = getelementptr inbounds ptr, ptr %params, i32 30
    %rd376 = load ptr, ptr %rd376_p1, align 8, !alias.scope !2
    %p377 = and <8 x i1> %p293, %p370
    %r378_1 = getelementptr i32, ptr %rd376, <8 x i32> %r290
    %r378 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %r378_1, i32 4, <8 x i1> %p377, <8 x i32> zeroinitializer)
    %f379_1 = insertelement <8 x float> undef, float 0x3ff0000000000000, i32 0
    %f379 = shufflevector <8 x float> %f379_1, <8 x float> undef, <8 x i32> zeroinitializer
    %r380_0 = trunc i64 %index to i32
    %r380_1 = insertelement <8 x i32> undef, i32 %r380_0, i32 0
    %r380_2 = shufflevector <8 x i32> %r380_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r380 = add <8 x i32> %r380_2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %p381_0 = trunc i64 %end to i32
    %p381_1 = insertelement <8 x i32> undef, i32 %p381_0, i32 0
    %p381_2 = shufflevector <8 x i32> %p381_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p381 = icmp ult <8 x i32> %r380, %p381_2
    %r382_1 = insertelement <8 x i32> undef, i32 0, i32 0
    %r382 = shufflevector <8 x i32> %r382_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p383 = icmp ne <8 x i32> %r378, %r382
    %p384 = and <8 x i1> %p370, %p383
    %p385 = and <8 x i1> %p381, %p384
    %f386 = select <8 x i1> %p385, <8 x float> %f379, <8 x float> zeroinitializer
    %rd387_p1 = getelementptr inbounds ptr, ptr %params, i32 31
    %rd387 = load ptr, ptr %rd387_p1, align 8, !alias.scope !2
    %rd388_p1 = getelementptr inbounds ptr, ptr %params, i32 32
    %rd388 = load ptr, ptr %rd388_p1, align 8, !alias.scope !2

    br label %l389_start

l389_start:
    ; VCall: mitsuba::Emitter::sample_direction()
    %u389_self_ptr = getelementptr i64, ptr %rd387, <8 x i32> %r378
    %u389_self_combined = call <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr> %u389_self_ptr, i32 8, <8 x i1> %p385, <8 x i64> zeroinitializer)
    %u389_self_initial = trunc <8 x i64> %u389_self_combined to <8 x i32>
    %u389_offset_1 = lshr <8 x i64> %u389_self_combined, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %u389_offset = trunc <8 x i64> %u389_offset_1 to <8 x i32>
    %u389_in_0_1 = getelementptr inbounds i8, ptr %buffer, i32 0
    store <8 x float> %f280, ptr %u389_in_0_1, align 32
    %u389_in_1_1 = getelementptr inbounds i8, ptr %buffer, i32 32
    store <8 x float> %f281, ptr %u389_in_1_1, align 32
    %u389_in_2_1 = getelementptr inbounds i8, ptr %buffer, i32 64
    store <8 x float> %f282, ptr %u389_in_2_1, align 32
    %u389_out = getelementptr i8, ptr %buffer, i32 96
    %u389_tmp_0_1 = getelementptr inbounds i8, ptr %u389_out, i64 0
    store <8 x float> zeroinitializer, ptr %u389_tmp_0_1, align 32
    %u389_tmp_1_1 = getelementptr inbounds i8, ptr %u389_out, i64 32
    store <8 x float> zeroinitializer, ptr %u389_tmp_1_1, align 32
    %u389_tmp_2_1 = getelementptr inbounds i8, ptr %u389_out, i64 64
    store <8 x float> zeroinitializer, ptr %u389_tmp_2_1, align 32
    %u389_tmp_3_1 = getelementptr inbounds i8, ptr %u389_out, i64 96
    store <8 x float> zeroinitializer, ptr %u389_tmp_3_1, align 32
    %u389_tmp_4_1 = getelementptr inbounds i8, ptr %u389_out, i64 128
    store <8 x float> zeroinitializer, ptr %u389_tmp_4_1, align 32
    %u389_tmp_5_1 = getelementptr inbounds i8, ptr %u389_out, i64 160
    store <8 x float> zeroinitializer, ptr %u389_tmp_5_1, align 32
    %u389_tmp_8_1 = getelementptr inbounds i8, ptr %u389_out, i64 192
    store <8 x float> zeroinitializer, ptr %u389_tmp_8_1, align 32
    %u389_tmp_9_1 = getelementptr inbounds i8, ptr %u389_out, i64 224
    store <8 x float> zeroinitializer, ptr %u389_tmp_9_1, align 32
    %u389_tmp_10_1 = getelementptr inbounds i8, ptr %u389_out, i64 256
    store <8 x float> zeroinitializer, ptr %u389_tmp_10_1, align 32
    br label %l389_check

l389_check:
    %u389_self = phi <8 x i32> [ %u389_self_initial, %l389_start ], [ %u389_self_next, %l389_call ]
    %u389_next = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %u389_self)
    %u389_valid = icmp ne i32 %u389_next, 0
    br i1 %u389_valid, label %l389_call, label %l389_end

l389_call:
    %u389_bcast_0 = insertelement <8 x i32> undef, i32 %u389_next, i32 0
    %u389_bcast = shufflevector <8 x i32> %u389_bcast_0, <8 x i32> undef, <8 x i32> zeroinitializer
    %u389_active = icmp eq <8 x i32> %u389_self, %u389_bcast
    %u389_func_0 = getelementptr inbounds ptr, ptr %callables, i32 %u389_next
    %u389_func = load ptr, ptr %u389_func_0
    call void %u389_func(<8 x i1> %u389_active, <8 x i32> %r378, ptr %buffer, ptr %rd388, <8 x i32> %u389_offset)
    %u389_self_next = select <8 x i1> %u389_active, <8 x i32> zeroinitializer, <8 x i32> %u389_self
    br label %l389_check

l389_end:
    %u389_out_0_1 = getelementptr inbounds i8, ptr %u389_out, i64 0
    %f392 = load <8 x float>, ptr %u389_out_0_1, align 32
    %u389_out_1_1 = getelementptr inbounds i8, ptr %u389_out, i64 32
    %f391 = load <8 x float>, ptr %u389_out_1_1, align 32
    %u389_out_2_1 = getelementptr inbounds i8, ptr %u389_out, i64 64
    %f390 = load <8 x float>, ptr %u389_out_2_1, align 32
    %u389_out_3_1 = getelementptr inbounds i8, ptr %u389_out, i64 96
    %f396 = load <8 x float>, ptr %u389_out_3_1, align 32
    %u389_out_4_1 = getelementptr inbounds i8, ptr %u389_out, i64 128
    %f395 = load <8 x float>, ptr %u389_out_4_1, align 32
    %u389_out_5_1 = getelementptr inbounds i8, ptr %u389_out, i64 160
    %f394 = load <8 x float>, ptr %u389_out_5_1, align 32
    %u389_out_8_1 = getelementptr inbounds i8, ptr %u389_out, i64 192
    %f393 = load <8 x float>, ptr %u389_out_8_1, align 32
    %u389_out_9_1 = getelementptr inbounds i8, ptr %u389_out, i64 224
    %f397 = load <8 x float>, ptr %u389_out_9_1, align 32
    %u389_out_10_1 = getelementptr inbounds i8, ptr %u389_out, i64 256
    %f398 = load <8 x float>, ptr %u389_out_10_1, align 32
    br label %l389_done

l389_done:
    %f399_1 = insertelement <8 x float> undef, float 0x0, i32 0
    %f399 = shufflevector <8 x float> %f399_1, <8 x float> undef, <8 x i32> zeroinitializer
    %p400 = fcmp one <8 x float> %f386, %f399
    %p401 = and <8 x i1> %p370, %p400
    %rd402_p1 = getelementptr inbounds ptr, ptr %params, i32 33
    %rd402 = load ptr, ptr %rd402_p1, align 8, !alias.scope !2
    %rd403_p1 = getelementptr inbounds ptr, ptr %params, i32 34
    %rd403 = load ptr, ptr %rd403_p1, align 8, !alias.scope !2
    %p404_1 = insertelement <8 x i1> undef, i1 0, i32 0
    %p404 = shufflevector <8 x i1> %p404_1, <8 x i1> undef, <8 x i32> zeroinitializer
    %r405_0 = trunc i64 %index to i32
    %r405_1 = insertelement <8 x i32> undef, i32 %r405_0, i32 0
    %r405_2 = shufflevector <8 x i32> %r405_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r405 = add <8 x i32> %r405_2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %p406_0 = trunc i64 %end to i32
    %p406_1 = insertelement <8 x i32> undef, i32 %p406_0, i32 0
    %p406_2 = shufflevector <8 x i32> %p406_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %p406 = icmp ult <8 x i32> %r405, %p406_2
    %p407 = and <8 x i1> %p406, %p401
    %r408_1 = insertelement <8 x i32> undef, i32 4294967295, i32 0
    %r408 = shufflevector <8 x i32> %r408_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r409_1 = insertelement <8 x i32> undef, i32 0, i32 0
    %r409 = shufflevector <8 x i32> %r409_1, <8 x i32> undef, <8 x i32> zeroinitializer
    %r410 = select <8 x i1> %p407, <8 x i32> %r408, <8 x i32> %r409
    %f411 = fsub <8 x float> %f390, %f282
    %f412 = fsub <8 x float> %f391, %f281
    %f413 = fsub <8 x float> %f392, %f280
    %f414 = fmul <8 x float> %f279, %f413
    %f415 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f278, <8 x float> %f412, <8 x float> %f414)
    %f416 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f277, <8 x float> %f411, <8 x float> %f415)
    %p417 = fcmp oge <8 x float> %f416, %f399
    %f418_1 = insertelement <8 x float> undef, float 0x3ff0000000000000, i32 0
    %f418 = shufflevector <8 x float> %f418_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f419 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %f280)
    %f420 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %f281)
    %f421 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %f419, <8 x float> %f420)
    %f422 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %f282)
    %f423 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %f421, <8 x float> %f422)
    %f424 = fadd <8 x float> %f418, %f423
    %f425_1 = insertelement <8 x float> undef, float 0x3f17700000000000, i32 0
    %f425 = shufflevector <8 x float> %f425_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f426 = fmul <8 x float> %f424, %f425
    %f427 = fneg <8 x float> %f426
    %f428 = select <8 x i1> %p417, <8 x float> %f426, <8 x float> %f427
    %f429 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f428, <8 x float> %f279, <8 x float> %f280)
    %f430 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f428, <8 x float> %f278, <8 x float> %f281)
    %f431 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f428, <8 x float> %f277, <8 x float> %f282)
    %f432 = fsub <8 x float> %f392, %f429
    %f433 = fsub <8 x float> %f390, %f431
    %f434 = fsub <8 x float> %f391, %f430
    %f435 = fmul <8 x float> %f432, %f432
    %f436 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f434, <8 x float> %f434, <8 x float> %f435)
    %f437 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f433, <8 x float> %f433, <8 x float> %f436)
    %f438 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f437)
    %f439 = fdiv <8 x float> %f418, %f438
    %f440 = fmul <8 x float> %f432, %f439
    %f441 = fmul <8 x float> %f434, %f439
    %f442 = fmul <8 x float> %f433, %f439
    %f443_1 = insertelement <8 x float> undef, float 0x3feff8ad00000000, i32 0
    %f443 = shufflevector <8 x float> %f443_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f444 = fmul <8 x float> %f438, %f443
    %r445_1 = insertelement <8 x i32> undef, i32 0, i32 0
    %r445 = shufflevector <8 x i32> %r445_1, <8 x i32> undef, <8 x i32> zeroinitializer

    ; -------- Ray test -------
    %u446_in_0_1 = getelementptr inbounds i8, ptr %buffer, i32 0
    store <8 x i32> %r410, ptr %u446_in_0_1, align 32
    %u446_in_1_1 = getelementptr inbounds i8, ptr %buffer, i32 32
    store <8 x float> %f429, ptr %u446_in_1_1, align 32
    %u446_in_2_1 = getelementptr inbounds i8, ptr %buffer, i32 64
    store <8 x float> %f430, ptr %u446_in_2_1, align 32
    %u446_in_3_1 = getelementptr inbounds i8, ptr %buffer, i32 96
    store <8 x float> %f431, ptr %u446_in_3_1, align 32
    %u446_in_4_1 = getelementptr inbounds i8, ptr %buffer, i32 128
    store <8 x float> %f399, ptr %u446_in_4_1, align 32
    %u446_in_5_1 = getelementptr inbounds i8, ptr %buffer, i32 160
    store <8 x float> %f440, ptr %u446_in_5_1, align 32
    %u446_in_6_1 = getelementptr inbounds i8, ptr %buffer, i32 192
    store <8 x float> %f441, ptr %u446_in_6_1, align 32
    %u446_in_7_1 = getelementptr inbounds i8, ptr %buffer, i32 224
    store <8 x float> %f442, ptr %u446_in_7_1, align 32
    %u446_in_8_1 = getelementptr inbounds i8, ptr %buffer, i32 256
    store <8 x float> %f43, ptr %u446_in_8_1, align 32
    %u446_in_9_1 = getelementptr inbounds i8, ptr %buffer, i32 288
    store <8 x float> %f444, ptr %u446_in_9_1, align 32
    %u446_in_10_1 = getelementptr inbounds i8, ptr %buffer, i32 320
    store <8 x i32> %r445, ptr %u446_in_10_1, align 32
    %u446_in_11_1 = getelementptr inbounds i8, ptr %buffer, i32 352
    store <8 x i32> %r445, ptr %u446_in_11_1, align 32
    %u446_in_12_1 = getelementptr inbounds i8, ptr %buffer, i32 384
    store <8 x i32> %r445, ptr %u446_in_12_1, align 32
    %u446_in_ctx_1 = getelementptr inbounds i8, ptr %buffer, i32 416
    store <6 x i32> , ptr %u446_in_ctx_1, align 4
    call void %rd402(ptr %u446_in_0_1, ptr %rd403, ptr %u446_in_ctx_1, ptr %u446_in_1_1)
    %u446_out_0_1 = getelementptr inbounds i8, ptr %buffer, i32 288
    %u446_out_0 = load <8 x float>, ptr %u446_out_0_1, align 32
    ; -------------------

    %f447 = bitcast <8 x float> %u446_out_0 to <8 x float>
    %p448 = fcmp one <8 x float> %f447, %f444
    %p449 = and <8 x i1> %p401, %p448
    %f450 = select <8 x i1> %p449, <8 x float> %f399, <8 x float> %f386
    %p451 = fcmp one <8 x float> %f450, %f399
    %p452 = and <8 x i1> %p370, %p451
    %f453 = select <8 x i1> %p449, <8 x float> %f399, <8 x float> %f393
    %f454 = fmul <8 x float> %f396, %f341
    %f455 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f395, <8 x float> %f333, <8 x float> %f454)
    %f456 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f394, <8 x float> %f321, <8 x float> %f455)
    %f457 = fmul <8 x float> %f396, %f354
    %f458 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f395, <8 x float> %f351, <8 x float> %f457)
    %f459 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f394, <8 x float> %f348, <8 x float> %f458)
    %f460 = fmul <8 x float> %f396, %f275
    %f461 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f395, <8 x float> %f276, <8 x float> %f460)
    %f462 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f394, <8 x float> %f274, <8 x float> %f461)
    %f463 = select <8 x i1> %p449, <8 x float> %f399, <8 x float> %f397
    %f464 = select <8 x i1> %p449, <8 x float> %f399, <8

mitsuba-renderer / mitsuba3