tinygo-org / tinygo

Go compiler for small places. Microcontrollers, WebAssembly (WASM/WASI), and command-line tools. Based on LLVM.
https://tinygo.org
Other
15.48k stars 913 forks source link

Error mapping of registers for in-line asm #1747

Open ewwaller opened 3 years ago

ewwaller commented 3 years ago

I think this is a bug in the way inline assembly is processed. I have some code that works properly with the -opt 1 flag, but does not work with the default optimization. The following minimized example includes code from the ws2812 driver. Running on a Adafruit Feather M4, this code displays increasing brightness of red, then green, then blue on the smart LED on that board (AKA Neopixel)

This is done by transmitting three bytes, each encoded as 8 pulses (for a total of 24 pulses) to the device. A narrow pulse codes a 0, and a wide pulse codes a 1

When it works (compiled with -opt 1) , the (correct) output waveform looks like this:

SDS00006

But, with no optimization directive, it looks like this: SDS00007

In this case, there are initial pulses for each of the three bytes, but the output never goes high for subsequent bits. The time between bytes is the same as had been for the working example.

The tinygo go is :

package main

import (
    "device/arm"
    "image/color"
    "machine"
    "time"
)

func main() {
    const Period uint8 = 10
    var c int
    var r, g, b uint8
    var theColors [1]color.RGBA
    thePixel := Device{machine.D8}

    for {
        theColors[0] = color.RGBA{
            R: r,
            G: g,
            B: b,
        }
        thePixel.WriteColors(theColors[:])
        time.Sleep(time.Duration(Period) * time.Millisecond)
        switch c {
        case 0:
            r++
            if r == 0 {
                c++
            }
        case 1:
            g++
            if g == 0 {
                c++
            }
        case 2:
            b++
            if b == 0 {
                c++
            }
        default:
        }
        c = c % 3
    }
}

// Device wraps a pin object for an easy driver interface.
type Device struct {
    Pin machine.Pin
}

// Write the raw bitstring out using the WS2812 protocol.
func (d Device) Write(buf []byte) (n int, err error) {
    for _, c := range buf {
        d.WriteByte(c)
    }
    return len(buf), nil
}

// Write the given color slice out using the WS2812 protocol.
// Colors are sent out in the usual GRB format.
func (d Device) WriteColors(buf []color.RGBA) error {
    for _, color := range buf {
        d.WriteByte(color.G) // green
        d.WriteByte(color.R) // red
        d.WriteByte(color.B) // blue
    }
    return nil
}

// Send a single byte using the WS2812 protocol.
func (d Device) WriteByte(c byte) error {
    // For the Cortex-M4 at 120MHz
    portSet, maskSet := d.Pin.PortMaskSet()
    portClear, maskClear := d.Pin.PortMaskClear()
    value := uint32(c) << 24
    var i uint32 = 8
    mask := arm.DisableInterrupts()
    arm.AsmFull(`
    1: @ send_bit
        str   {maskSet}, {portSet}     @ [2]   T0H and T0L start here
        lsls  {value}, #1              @ [1]
        nop                            @ [28]
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        bcs.n 2f                       @ [1-3] skip_store
        str   {maskClear}, {portClear} @ [2]   T0H -> T0L transition
    2: @ skip_store
        nop                            @ [41]
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        str   {maskClear}, {portClear} @ [2]   T1H -> T1L transition
        nop                            @ [54]
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        nop
        subs  {i}, #1                  @ [1]
        bne.n 1b                       @ [1-3] send_bit
    `, map[string]interface{}{
        "value":     value,
        "i":         i,
        "maskSet":   maskSet,
        "portSet":   portSet,
        "maskClear": maskClear,
        "portClear": portClear,
    })
    arm.EnableInterrupts(mask)
    return nil
}

The inline code uses a register to count of the 8 pulses in a register which is preset to 8 by the wrapping code. The wrapping code also figues out the address of the register to manipulate to control the output and the data that need to be written. All of those are passed into the inline code using the map clause. The wrapping code calls machine functions to get the port information.

Looking at the disassembled code of the working code (-opt 1) for WriteByte we get:

    6d08:       e92d 41f0       stmdb   sp!, {r4, r5, r6, r7, r8, lr}
    6d0c:       4604            mov     r4, r0
    6d0e:       f7fd fe86       bl      4a1e <(machine.Pin).PortMaskSet>
    6d12:       4680            mov     r8, r0
    6d14:       460e            mov     r6, r1
    6d16:       f7fd fe72       bl      49fe <(machine.Pin).PortMaskClear>
    6d1a:       4607            mov     r7, r0
    6d1c:       460d            mov     r5, r1
    6d1e:       0624            lsls    r4, r4, #24
    6d20:       f7fd fb43       bl      43aa <device/arm.DisableInterrupts>
    6d24:       2108            movs    r1, #8
    6d26:       f8c8 6000       str.w   r6, [r8]
    6d2a:       0064            lsls    r4, r4, #1
    6d2c:       bf00            nop
    6d2e:       bf00            nop
    6d30:       bf00            nop
    6d32:       bf00            nop
    6d34:       bf00            nop
    6d36:       bf00            nop
    6d38:       bf00            nop
    6d3a:       bf00            nop
    6d3c:       bf00            nop
    6d3e:       bf00            nop
    6d40:       bf00            nop
    6d42:       bf00            nop
    6d44:       bf00            nop
    6d46:       bf00            nop
    6d48:       bf00            nop
    6d4a:       bf00            nop
    6d4c:       bf00            nop
    6d4e:       bf00            nop
    6d50:       bf00            nop
    6d52:       bf00            nop
    6d54:       bf00            nop
    6d56:       d200            bcs.n   6d5a <(main.Device).WriteByte+0x52>
    6d58:       603d            str     r5, [r7, #0]
    6d5a:       bf00            nop
    6d5c:       bf00            nop
    6d5e:       bf00            nop
    6d60:       bf00            nop
    6d62:       bf00            nop
    6d64:       bf00            nop
    6d66:       bf00            nop
    6d68:       bf00            nop
    6d6a:       bf00            nop
    6d6c:       bf00            nop
    6d6e:       bf00            nop
    6d70:       bf00            nop
    6d72:       bf00            nop
    6d74:       bf00            nop
    6d76:       bf00            nop
    6d78:       bf00            nop
    6d7a:       bf00            nop
    6d7c:       bf00            nop
    6d7e:       bf00            nop
    6d80:       bf00            nop
    6d82:       bf00            nop
    6d84:       bf00            nop
    6d86:       bf00            nop
    6d88:       bf00            nop
    6d8a:       bf00            nop
    6d8c:       bf00            nop
    6d8e:       bf00            nop
    6d90:       bf00            nop
    6d92:       bf00            nop
    6d94:       bf00            nop
    6d96:       bf00            nop
    6d98:       bf00            nop
    6d9a:       bf00            nop
    6d9c:       bf00            nop
    6d9e:       bf00            nop
    6da0:       bf00            nop
    6da2:       bf00            nop
    6da4:       bf00            nop
    6da6:       bf00            nop
    6da8:       603d            str     r5, [r7, #0]
    6daa:       bf00            nop
    6dac:       bf00            nop
    6dae:       bf00            nop
    6db0:       bf00            nop
    6db2:       bf00            nop
    6db4:       bf00            nop
    6db6:       bf00            nop
    6db8:       bf00            nop
    6dba:       bf00            nop
    6dbc:       bf00            nop
    6dbe:       bf00            nop
    6dc0:       bf00            nop
    6dc2:       bf00            nop
    6dc4:       bf00            nop
    6dc6:       bf00            nop
    6dc8:       bf00            nop
    6dca:       bf00            nop
    6dcc:       bf00            nop
    6dce:       bf00            nop
    6dd0:       bf00            nop
    6dd2:       bf00            nop
    6dd4:       bf00            nop
    6dd6:       bf00            nop
    6dd8:       bf00            nop
    6dda:       bf00            nop
    6ddc:       bf00            nop
    6dde:       bf00            nop
    6de0:       bf00            nop
    6de2:       bf00            nop
    6de4:       bf00            nop
    6de6:       bf00            nop
    6de8:       bf00            nop
    6dea:       bf00            nop
    6dec:       bf00            nop
    6dee:       bf00            nop
    6df0:       bf00            nop
    6df2:       bf00            nop
    6df4:       bf00            nop
    6df6:       bf00            nop
    6df8:       bf00            nop
    6dfa:       bf00            nop
    6dfc:       bf00            nop
    6dfe:       bf00            nop
    6e00:       bf00            nop
    6e02:       bf00            nop
    6e04:       bf00            nop
    6e06:       3901            subs    r1, #1
    6e08:       d18d            bne.n   6d26 <(main.Device).WriteByte+0x1e>
    6e0a:       f7fd faf2       bl      43f2 <device/arm.EnableInterrupts>
    6e0e:       e8bd 81f0       ldmia.w sp!, {r4, r5, r6, r7, r8, pc}

Whereas the optimized, non-working code :

    5caa:       f248 0394       movw    r3, #32916      ; 0x8094
    5cae:       f248 0298       movw    r2, #32920      ; 0x8098
    5cb2:       0600            lsls    r0, r0, #24
    5cb4:       f3ef 8110       mrs     r1, PRIMASK
    5cb8:       b672            cpsid   i
    5cba:       f04f 0c08       mov.w   ip, #8
    5cbe:       f2c4 1300       movt    r3, #16640      ; 0x4100
    5cc2:       f2c4 1200       movt    r2, #16640      ; 0x4100
    5cc6:       f8c2 c000       str.w   ip, [r2]
    5cca:       0040            lsls    r0, r0, #1
    5ccc:       bf00            nop
    5cce:       bf00            nop
    5cd0:       bf00            nop
    5cd2:       bf00            nop
    5cd4:       bf00            nop
    5cd6:       bf00            nop
    5cd8:       bf00            nop
    5cda:       bf00            nop
    5cdc:       bf00            nop
    5cde:       bf00            nop
    5ce0:       bf00            nop
    5ce2:       bf00            nop
    5ce4:       bf00            nop
    5ce6:       bf00            nop
    5ce8:       bf00            nop
    5cea:       bf00            nop
    5cec:       bf00            nop
    5cee:       bf00            nop
    5cf0:       bf00            nop
    5cf2:       bf00            nop
    5cf4:       bf00            nop
    5cf6:       d201            bcs.n   5cfc <(main.Device).WriteByte+0x52>
    5cf8:       f8c3 c000       str.w   ip, [r3]
    5cfc:       bf00            nop
    5cfe:       bf00            nop
    5d00:       bf00            nop
    5d02:       bf00            nop
    5d04:       bf00            nop
    5d06:       bf00            nop
    5d08:       bf00            nop
    5d0a:       bf00            nop
    5d0c:       bf00            nop
    5d0e:       bf00            nop
    5d10:       bf00            nop
    5d12:       bf00            nop
    5d14:       bf00            nop
    5d16:       bf00            nop
    5d18:       bf00            nop
    5d1a:       bf00            nop
    5d1c:       bf00            nop
    5d1e:       bf00            nop
    5d20:       bf00            nop
    5d22:       bf00            nop
    5d24:       bf00            nop
    5d26:       bf00            nop
    5d28:       bf00            nop
    5d2a:       bf00            nop
    5d2c:       bf00            nop
    5d2e:       bf00            nop
    5d30:       bf00            nop
    5d32:       bf00            nop
    5d34:       bf00            nop
    5d36:       bf00            nop
    5d38:       bf00            nop
    5d3a:       bf00            nop
    5d3c:       bf00            nop
    5d3e:       bf00            nop
    5d40:       bf00            nop
    5d42:       bf00            nop
    5d44:       bf00            nop
    5d46:       bf00            nop
    5d48:       bf00            nop
    5d4a:       f8c3 c000       str.w   ip, [r3]
    5d4e:       bf00            nop
    5d50:       bf00            nop
    5d52:       bf00            nop
    5d54:       bf00            nop
    5d56:       bf00            nop
    5d58:       bf00            nop
    5d5a:       bf00            nop
    5d5c:       bf00            nop
    5d5e:       bf00            nop
    5d60:       bf00            nop
    5d62:       bf00            nop
    5d64:       bf00            nop
    5d66:       bf00            nop
    5d68:       bf00            nop
    5d6a:       bf00            nop
    5d6c:       bf00            nop
    5d6e:       bf00            nop
    5d70:       bf00            nop
    5d72:       bf00            nop
    5d74:       bf00            nop
    5d76:       bf00            nop
    5d78:       bf00            nop
    5d7a:       bf00            nop
    5d7c:       bf00            nop
    5d7e:       bf00            nop
    5d80:       bf00            nop
    5d82:       bf00            nop
    5d84:       bf00            nop
    5d86:       bf00            nop
    5d88:       bf00            nop
    5d8a:       bf00            nop
    5d8c:       bf00            nop
    5d8e:       bf00            nop
    5d90:       bf00            nop
    5d92:       bf00            nop
    5d94:       bf00            nop
    5d96:       bf00            nop
    5d98:       bf00            nop
    5d9a:       bf00            nop
    5d9c:       bf00            nop
    5d9e:       bf00            nop
    5da0:       bf00            nop
    5da2:       bf00            nop
    5da4:       bf00            nop
    5da6:       bf00            nop
    5da8:       bf00            nop
    5daa:       f1bc 0c01       subs.w  ip, ip, #1
    5dae:       d18a            bne.n   5cc6 <(main.Device).WriteByte+0x1c>
    5db0:       f381 8810       msr     PRIMASK, r1
    5db4:       4770            bx      lr

Note that in this case, the optimizer figured out it did not need to call machine as the information was available at compile time, so it used constants. The problem appears to be, in this second case, it is attempting to use the ip register for both the count down from 8 to 0 [ see 0x5daa - 0x5dae], AND as the pointer to the register that contain the I/O register [see 0x5cf8 and 0x5d4a]

aykevl commented 3 years ago

I can confirm the issue. I think the issue is that maskSet, maskClear and i are all the constant 8 and the optimizer recognizes this and puts them all in the same register. That would be fine, if the values were indeed treated as constants. But while maskSet and maskClear are constant (and merging them is a good idea), i is not. This leads to the miscompilation.

Right now the inline assembly format does not support marking certain input values as clobbered. I'm not sure whether I really want to add such support, AsmFull is already pretty complicated. Instead, it might be better to move these snippets of inline assembly to C (usable through CGo) so that all the features of GCC style inline assembly can be used (including marking some inputs as clobbered). CGo might sound expensive, but in TinyGo it's just a regular function call and I have plans of optimizing it further so that it is optimized together with Go code (and can be inlined, etc, see #1707 for some related work).

aykevl commented 3 years ago

Also, I just want to say thank you for this super helpful bug report. It's a very clear description of the problem, with a reduced code sample even. It makes finding the bug a whole lot easier :)

ewwaller commented 3 years ago

Thanks, and my pleasure. I am new to go, but have spent my professional career with microcontrollers and embedded systems. I am new to go, but am increasingly excited about tinygo.

ewwaller commented 3 years ago

Edit: Never mind, Split the function to a *.c file, created a .h file, and included the .h instead of the function. That linked, and is cleaner


Original post: :

I have been playing with this, and have hit a roadblock with CGo and could use some help getting back on the tracks.

Here is my code for the ws2812 driver:

// +build atsamd51

package ws2812

// This file implements the WS2812 protocol for 120MHz Cortex-M4
// microcontrollers.
// Note: This implementation does not work with tinygo 0.9.0 or older.

import (
    "device/arm"
)

/*
#include <stdint.h>
void writebyte( unsigned char c, \
                uint32_t maskSet, \
                uint32_t maskClear, \
                uint32_t* portSet, \
                uint32_t* portClear){
   int cnt;
    int i;
    for (i = 7 ; i != 0xff ; i--) {
        if ((1<<i) & c) {
            cnt=4;
            while (cnt--);
            *portClear = maskClear;
            cnt=8;
            while (cnt--);
        } else {
            cnt=8;
            while (cnt--);
            *portClear = maskClear;
            cnt=4;
            while (cnt--);
        }
        *portSet = maskSet;
        *portClear = maskClear;
    }
 }
*/
import "C"

// Send a single byte using the WS2812 protocol.
func (d Device) WriteByte(c byte) error {
    // For the Cortex-M4 at 120MHz
    portSet, maskSet := d.Pin.PortMaskSet()
    portClear, maskClear := d.Pin.PortMaskClear()
    // See:
    // https://wp.josh.com/2014/05/13/ws2812-neopixels-are-not-so-finicky-once-you-get-to-know-them/
    // T0H: 32-34   cycles or  266.67ns -  283.33ns
    // T0L: 101-103 cycles or  841.67ns -  858.33ns
    //   +: 133-137 cycles or 1108.33ns - 1141.67ns
    // T1H: 73-75   cycles or  608.33ns -  625.00ns
    // T1L: 58-60   cycles or  483.33ns -  500.00ns
    //   +: 131-135 cycles or 1091.67ns - 1125.00ns

    mask := arm.DisableInterrupts()
    C.writebyte(
        C.uchar(c),
        C.uint32_t(maskSet),
        C.uint32_t(maskClear),
        portSet,
        portClear)
    arm.EnableInterrupts(mask)
    return nil
}

but, I get this when I build it:

ld.lld: error: undefined symbol: writebyte
>>> referenced by ws2812_m4_120m.go:58 (/home/ewaller/devel/go/src/ewaller.local/featherBlink/drivers/ws2812/ws2812_m4_120m.go:58)
>>>               /tmp/tinygo887064602/main.o:((featherBlink/drivers/ws2812.Device).WriteByte)
error: failed to link /tmp/tinygo887064602/main: exit status 1
ewaller@odin/home/ewaller/devel/go/src/ewaller.local/featherBlink[1] % 

It compiles, but dies at the linker. Not sure where to go from here.