certik / fastGPT

Fast GPT-2 inference written in Fortran
MIT License
180 stars 16 forks source link

maxval #49

Closed certik closed 1 year ago

certik commented 1 year ago

https://github.com/certik/fastGPT/commit/4f778d2850fa5cad1ced6b630b1b7e226663cccb

Smit-create commented 1 year ago

I suppose this is fixed too, by https://github.com/lfortran/lfortran/pull/1901 — can you please verify? Thanks!

certik commented 1 year ago

I tried, and it failed with:

code generation error: asr_to_llvm: module failed verification. Error:
Call parameter type does not match function signature!
  %call_arg_value = alloca %array*, align 8
 %array*  %20 = call i32 @MaxVal_4_1_0(%array** %call_arg_value)

The commit above shows the few cases where it is used, so we need to isolate the failing case and fix it.

certik commented 1 year ago

It works for maxval(x(:,i)), but fails for maxval(m%byte_encoder).

certik commented 1 year ago

Here is a minimal reproducible example that can be used as a test:

! Minimal reproducer: calls maxval on an allocatable integer array that is a
! component of a derived type (m%byte_encoder) and checks the result.
program test_maxval
! Derived type whose only component is an allocatable rank-1 integer array.
type :: model_t
    integer, allocatable :: byte_encoder(:)
end type
type(model_t) :: m
! Give the component five elements and set every element to 5.
allocate(m%byte_encoder(5))
m%byte_encoder = 5
! Every element is 5, so maxval must return 5; otherwise abort with error stop.
if (maxval(m%byte_encoder) /= 5) error stop
end program

This gives:

$ lfortran a.f90 
; ModuleID = 'LFortran'
source_filename = "LFortran"

%array = type { i32*, i32, %dimension_descriptor*, i1, i32 }
%dimension_descriptor = type { i32, i32, i32 }
%model_t = type { %array* }

@0 = private unnamed_addr constant [12 x i8] c"ERROR STOP\0A\00", align 1

define i32 @MaxVal_4_1_0(%array* %array) {
.entry:
  %__1_i = alloca i32, align 4
  %result = alloca i32, align 4
  store i32 -2147483648, i32* %result, align 4
  %0 = getelementptr %array, %array* %array, i32 0, i32 2
  %1 = load %dimension_descriptor*, %dimension_descriptor** %0, align 8
  %2 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %1, i32 0
  %3 = getelementptr %dimension_descriptor, %dimension_descriptor* %2, i32 0, i32 1
  %4 = load i32, i32* %3, align 4
  %5 = sub i32 %4, 1
  store i32 %5, i32* %__1_i, align 4
  br label %loop.head

loop.head:                                        ; preds = %loop.body, %.entry
  %6 = load i32, i32* %__1_i, align 4
  %7 = add i32 %6, 1
  %8 = getelementptr %array, %array* %array, i32 0, i32 2
  %9 = load %dimension_descriptor*, %dimension_descriptor** %8, align 8
  %10 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %9, i32 0
  %11 = getelementptr %dimension_descriptor, %dimension_descriptor* %10, i32 0, i32 1
  %12 = load i32, i32* %11, align 4
  %13 = getelementptr %dimension_descriptor, %dimension_descriptor* %10, i32 0, i32 2
  %14 = load i32, i32* %13, align 4
  %15 = add i32 %14, %12
  %16 = sub i32 %15, 1
  %17 = icmp sle i32 %7, %16
  br i1 %17, label %loop.body, label %loop.end

loop.body:                                        ; preds = %loop.head
  %18 = load i32, i32* %__1_i, align 4
  %19 = add i32 %18, 1
  store i32 %19, i32* %__1_i, align 4
  %20 = load i32, i32* %result, align 4
  %21 = load i32, i32* %__1_i, align 4
  %22 = getelementptr %array, %array* %array, i32 0, i32 2
  %23 = load %dimension_descriptor*, %dimension_descriptor** %22, align 8
  %24 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %23, i32 0
  %25 = getelementptr %dimension_descriptor, %dimension_descriptor* %24, i32 0, i32 1
  %26 = load i32, i32* %25, align 4
  %27 = sub i32 %21, %26
  %28 = getelementptr %dimension_descriptor, %dimension_descriptor* %24, i32 0, i32 0
  %29 = load i32, i32* %28, align 4
  %30 = mul i32 %29, %27
  %31 = add i32 0, %30
  %32 = getelementptr %array, %array* %array, i32 0, i32 1
  %33 = load i32, i32* %32, align 4
  %34 = add i32 %31, %33
  %35 = getelementptr %array, %array* %array, i32 0, i32 0
  %36 = load i32*, i32** %35, align 8
  %37 = getelementptr inbounds i32, i32* %36, i32 %34
  %38 = load i32, i32* %37, align 4
  %39 = icmp sgt i32 %20, %38
  %40 = load i32, i32* %result, align 4
  %41 = load i32, i32* %__1_i, align 4
  %42 = getelementptr %array, %array* %array, i32 0, i32 2
  %43 = load %dimension_descriptor*, %dimension_descriptor** %42, align 8
  %44 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %43, i32 0
  %45 = getelementptr %dimension_descriptor, %dimension_descriptor* %44, i32 0, i32 1
  %46 = load i32, i32* %45, align 4
  %47 = sub i32 %41, %46
  %48 = getelementptr %dimension_descriptor, %dimension_descriptor* %44, i32 0, i32 0
  %49 = load i32, i32* %48, align 4
  %50 = mul i32 %49, %47
  %51 = add i32 0, %50
  %52 = getelementptr %array, %array* %array, i32 0, i32 1
  %53 = load i32, i32* %52, align 4
  %54 = add i32 %51, %53
  %55 = getelementptr %array, %array* %array, i32 0, i32 0
  %56 = load i32*, i32** %55, align 8
  %57 = getelementptr inbounds i32, i32* %56, i32 %54
  %58 = load i32, i32* %57, align 4
  %59 = select i1 %39, i32 %40, i32 %58
  store i32 %59, i32* %result, align 4
  br label %loop.head

loop.end:                                         ; preds = %loop.head
  br label %return

return:                                           ; preds = %loop.end
  %60 = load i32, i32* %result, align 4
  ret i32 %60
}

define i32 @main(i32 %0, i8** %1) {
.entry:
  %call_arg_value = alloca %array*, align 8
  call void @_lpython_set_argv(i32 %0, i8** %1)
  %__1_t = alloca i32, align 4
  %m = alloca %model_t, align 8
  %2 = getelementptr %model_t, %model_t* %m, i32 0, i32 0
  store %array* null, %array** %2, align 8
  %arr_desc = alloca %array, align 8
  %3 = getelementptr %array, %array* %arr_desc, i32 0, i32 2
  %4 = alloca i32, align 4
  store i32 1, i32* %4, align 4
  %5 = load i32, i32* %4, align 4
  %6 = alloca %dimension_descriptor, i32 %5, align 8
  store %dimension_descriptor* %6, %dimension_descriptor** %3, align 8
  %7 = getelementptr %array, %array* %arr_desc, i32 0, i32 4
  store i32 1, i32* %7, align 4
  %8 = getelementptr %array, %array* %arr_desc, i32 0, i32 0
  store i32* null, i32** %8, align 8
  store %array* %arr_desc, %array** %2, align 8
  %9 = getelementptr %model_t, %model_t* %m, i32 0, i32 0
  %10 = load %array*, %array** %9, align 8
  %11 = getelementptr %array, %array* %10, i32 0, i32 1
  store i32 0, i32* %11, align 4
  %12 = getelementptr %array, %array* %10, i32 0, i32 2
  %13 = load %dimension_descriptor*, %dimension_descriptor** %12, align 8
  %14 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %13, i32 0
  %15 = getelementptr %dimension_descriptor, %dimension_descriptor* %14, i32 0, i32 0
  %16 = getelementptr %dimension_descriptor, %dimension_descriptor* %14, i32 0, i32 1
  %17 = getelementptr %dimension_descriptor, %dimension_descriptor* %14, i32 0, i32 2
  store i32 1, i32* %15, align 4
  store i32 1, i32* %16, align 4
  store i32 5, i32* %17, align 4
  %18 = getelementptr %array, %array* %10, i32 0, i32 0
  %19 = alloca i32, align 4
  store i32 20, i32* %19, align 4
  %20 = load i32, i32* %19, align 4
  %21 = call i8* (i32, ...) @_lfortran_malloc(i32 %20)
  %22 = bitcast i8* %21 to i32*
  store i32* %22, i32** %18, align 8
  %23 = getelementptr %model_t, %model_t* %m, i32 0, i32 0
  %24 = load %array*, %array** %23, align 8
  %25 = getelementptr %array, %array* %24, i32 0, i32 2
  %26 = load %dimension_descriptor*, %dimension_descriptor** %25, align 8
  %27 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %26, i32 0
  %28 = getelementptr %dimension_descriptor, %dimension_descriptor* %27, i32 0, i32 1
  %29 = load i32, i32* %28, align 4
  %30 = sub i32 %29, 1
  store i32 %30, i32* %__1_t, align 4
  br label %loop.head

loop.head:                                        ; preds = %loop.body, %.entry
  %31 = load i32, i32* %__1_t, align 4
  %32 = add i32 %31, 1
  %33 = getelementptr %model_t, %model_t* %m, i32 0, i32 0
  %34 = load %array*, %array** %33, align 8
  %35 = getelementptr %array, %array* %34, i32 0, i32 2
  %36 = load %dimension_descriptor*, %dimension_descriptor** %35, align 8
  %37 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %36, i32 0
  %38 = getelementptr %dimension_descriptor, %dimension_descriptor* %37, i32 0, i32 1
  %39 = load i32, i32* %38, align 4
  %40 = getelementptr %dimension_descriptor, %dimension_descriptor* %37, i32 0, i32 2
  %41 = load i32, i32* %40, align 4
  %42 = add i32 %41, %39
  %43 = sub i32 %42, 1
  %44 = icmp sle i32 %32, %43
  br i1 %44, label %loop.body, label %loop.end

loop.body:                                        ; preds = %loop.head
  %45 = load i32, i32* %__1_t, align 4
  %46 = add i32 %45, 1
  store i32 %46, i32* %__1_t, align 4
  %47 = getelementptr %model_t, %model_t* %m, i32 0, i32 0
  %48 = load i32, i32* %__1_t, align 4
  %49 = load %array*, %array** %47, align 8
  %50 = getelementptr %array, %array* %49, i32 0, i32 2
  %51 = load %dimension_descriptor*, %dimension_descriptor** %50, align 8
  %52 = getelementptr inbounds %dimension_descriptor, %dimension_descriptor* %51, i32 0
  %53 = getelementptr %dimension_descriptor, %dimension_descriptor* %52, i32 0, i32 1
  %54 = load i32, i32* %53, align 4
  %55 = sub i32 %48, %54
  %56 = getelementptr %dimension_descriptor, %dimension_descriptor* %52, i32 0, i32 0
  %57 = load i32, i32* %56, align 4
  %58 = mul i32 %57, %55
  %59 = add i32 0, %58
  %60 = getelementptr %array, %array* %49, i32 0, i32 1
  %61 = load i32, i32* %60, align 4
  %62 = add i32 %59, %61
  %63 = getelementptr %array, %array* %49, i32 0, i32 0
  %64 = load i32*, i32** %63, align 8
  %65 = getelementptr inbounds i32, i32* %64, i32 %62
  store i32 5, i32* %65, align 4
  br label %loop.head

loop.end:                                         ; preds = %loop.head
  %66 = getelementptr %model_t, %model_t* %m, i32 0, i32 0
  %67 = load %array*, %array** %66, align 8
  store %array* %67, %array** %call_arg_value, align 8
  %68 = call i32 @MaxVal_4_1_0(%array** %call_arg_value)
  %69 = icmp ne i32 %68, 5
  br i1 %69, label %then, label %else

then:                                             ; preds = %loop.end
  call void (i8*, ...) @_lcompilers_print_error(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @0, i32 0, i32 0))
  call void @exit(i32 1)
  br label %ifcont

else:                                             ; preds = %loop.end
  br label %ifcont

ifcont:                                           ; preds = %else, %then
  ret i32 0
}

declare void @_lpython_set_argv(i32, i8**)

declare i8* @_lfortran_malloc(i32, ...)

declare void @_lcompilers_print_error(i8*, ...)

declare void @exit(i32)
code generation error: asr_to_llvm: module failed verification. Error:
Call parameter type does not match function signature!
  %call_arg_value = alloca %array*, align 8
 %array*  %68 = call i32 @MaxVal_4_1_0(%array** %call_arg_value)

Note: Please report unclear or confusing messages as bugs at
https://github.com/lfortran/lfortran/issues.

Smit-create commented 1 year ago

I see — the argument needs a pointer load so that its type matches the parameter type in the function signature.

certik commented 1 year ago

Yes, it looks like it is loading it one too many times. Instead of:

  %67 = load %array*, %array** %66, align 8
  store %array* %67, %array** %call_arg_value, align 8
  %68 = call i32 @MaxVal_4_1_0(%array** %call_arg_value)

it seems it should just be:

  %67 = load %array*, %array** %66, align 8
  %68 = call i32 @MaxVal_4_1_0(%array* %67)

czgdp1807 commented 1 year ago

It works for maxval(x(:,i)), but fails for maxval(m%byte_encoder).

Because the first is handled by the array_op pass, but the second is a StructInstanceMember, which we need to load one more time. You can do it in convert_call_args by checking for ASR::is_a<ASR::StructInstanceMember_t> and then loading it.

czgdp1807 commented 1 year ago

Let me just get it done.