pycom / pycom-micropython-sigfox

A fork of MicroPython with the ESP32 port customized to run on Pycom's IoT multi-network modules.
MIT License
196 stars 167 forks source link

LoRaMesh crashes after several iterations (10 mins aprox.) - Core Dump Error #611

Open CIRCk8 opened 1 year ago

CIRCk8 commented 1 year ago

Hi,

We are using FiPy4, with v1.20.0.rc10[legacy] version, for a LoRaMesh project.

After several iterations, peer's execution breaks into a core dump error. After that error, peer goes to a reset loop.

We share our peer's logs

[main()] Running server comm. task
[main()] Sending List-Of-Nodes data to platform
[main()] Running heartbeat comm. task
[main()] Running garbage cleaner (in heap)
[MemInfoPcg()] Free [%]: 99.038
[MemInfoPcg()] Alloc [%]: 0.9632443
[main()] Reading LoRa - Mesh UDP port ...
[main()] Running Thread-Over-LoRa task
[main()] Running Led-Alive
[main()] Running mesh state update
[main()] 1674218960: PYMESH_ROLE_LEADER, IP: fdde:ad00:beef:0:e87:4321:90c8:6432 [not singleton]
[main()] Verifying if neighbours are alive (via lora ping)
[main()] 1674218960: neighbors = 4; IPv6 list: ['fdde:ad00:beef:0:0:ff:fe00:807', 'fdde:ad00:beef:0:0:ff:fe00:9800', 'fdde:ad00:beef:0:0:ff:fe00:b800', 'fdde:ad00:beef:0:0:ff:fe00:bc00']
Guru Meditation Error: Core  1 panic'ed (LoadProhibited). Exception was unhandled.
Core 1 register dump:
PC      : 0x401f7032  PS      : 0x00060530  A0      : 0x801f7164  A1      : 0x3ffb4230  
A2      : 0x0000060c  A3      : 0x4011d818  A4      : 0x0012191d  A5      : 0x00000272  
A6      : 0x00000003  A7      : 0x00060023  A8      : 0x801f7028  A9      : 0x3ffb4210  
A10     : 0x1d191200  A11     : 0x00000000  A12     : 0x00000001  A13     : 0x0000abab  
A14     : 0x3ffc1908  A15     : 0x00000001  SAR     : 0x00000010  EXCCAUSE: 0x0000001c  
EXCVADDR: 0x00000700  LBEG    : 0x4009d844  LEND    : 0x4009d872  LCOUNT  : 0xffffffff   Backtrace: 0x401f7032:0x3ffb4230 0x401f7161:0x3ffb4260 0x401d8fd1:0x3ffb4280 0x401d8ff6:0x3ffb42a0 0x4011d7d7:0x3ffb42c0 0x401089a5:0x3ffb42e0 ================= CORE DUMP START =================

Checking core dump backtraces, we explore that the problem is in modlora.c and openthread lib. Our output of decoded backtrace:

circe@circe-Vostro-5471:~/repos/esp32-loramesh-test/xtensa-esp32-elf/bin$ ./xtensa-esp32-elf-addr2line -fe ~/Descargas/FiPy-1.20.0.rc10-application.elf  0x401f7032:0x3ffb4230 0x401f7161:0x3ffb4260 0x401d8fd1:0x3ffb4280 0x401d8ff6:0x3ffb42a0 0x4011d7d7:0x3ffb42c0 0x401089a5:0x3ffb42e0
_ZN2ot3Cli11Interpreter15HandlePingTimerEv
/Users/iwahdan/esp/pycom-esp-idf/components/openthread/src/cli/cli.cpp:1971
_ZN2ot3Cli11Interpreter17s_HandlePingTimerERNS_5TimerE
/Users/iwahdan/esp/pycom-esp-idf/components/openthread/src/cli/cli.cpp:1960
_ZN2ot14TimerScheduler13ProcessTimersERKNS0_8AlarmApiE
/Users/iwahdan/esp/pycom-esp-idf/components/openthread/src/common/timer.hpp:132
otPlatAlarmMilliFired
/Users/iwahdan/esp/pycom-esp-idf/components/openthread/src/common/timer.hpp:379
alarmCallback
/var/jenkins_home/workspace/opython-sigfox_release-candidate@2/esp32/lora/otplat_alarm.c:44
TASK_LoRa_Timer
/var/jenkins_home/workspace/opython-sigfox_release-candidate@2/esp32/mods/modlora.c:1169

Our main.py is:

from lorameshlib import Loramesh
from NodeDataClass import NodeData
from ListOfNodesClass import ListOfNodes
from ServerCommClass import ServerComm
from network import LoRa
import time
import socket
import ubinascii
import pycom
import machine
import os
import gc
import json

########################################
# CONSTANT & PARAMS
########################################
OK_FLAG = 'ok'
NOK_FLAG = 'nok'
UDPPORT = 1234
PYMESHSTATE = ["PYMESH_ROLE_DISABLED", "PYMESH_ROLE_DETACHED", "PYMESH_ROLE_CHILD", "PYMESH_ROLE_ROUTER", "PYMESH_ROLE_LEADER"]
MESHSTATE = {"offline": "PYMESH_ROLE_OFFLINE", "disabled": "PYMESH_ROLE_DISABLED", "detached": "PYMESH_ROLE_DETACHED", "child": "PYMESH_ROLE_CHILD", "router": "PYMESH_ROLE_ROUTER", "leader": "PYMESH_ROLE_LEADER"}
MAX_NODES_FOR_HUB = 10

##########################################
# Function to parse peer's packet
#
# Returns (Mac,PacketNum, ListOfTemps)
##########################################
def parseNodeData(Packet):
    PeerMac = ''
    PacketNum = 0
    PeerTemps = ''
    PeerName = ''

    # Create json object
    #print(Packet)
    PacketAsObj = json.loads(Packet)

    # Parse
    try:
        PeerMac = PacketAsObj['Mac-Nodo']
        PacketNum = PacketAsObj['Packet_t']
        PeerTemps = str(PacketAsObj['Temps'])
        PeerName = PacketAsObj['NOMBRE']
    except:
        pass
    return (PeerMac, PacketNum, PeerTemps, PeerName)

##########################################
# Disable default pycom settings
##########################################
def DisableDefaultCnf():
    # Disable some default staff from pycom
    print("[DisableDefaultCnf()] Quit default PyCom cnf.")
    pycom.wifi_on_boot(False)
    pycom.lte_modem_en_on_boot(False)
    pycom.heartbeat(False)

##########################################
# Configure THREAD over lora
##########################################
def SetupLoRa():
    # Config LoRa 
    print("[SetupLoRa()] Setting LoRa Interface ...")
    try:
        lora = LoRa(mode=LoRa.LORA, region=LoRa.EU868,  tx_power=20, bandwidth=LoRa.BW_250KHZ, sf=7)
    except Exception as e:
        print("[SetupLoRa()] DISASTER! exception opening Lora: "+str(e)+"... retrying.")
        time.sleep(2)
        print("")

    # Get MAC of hub
    mac = str(ubinascii.hexlify(lora.mac()))[2:-1]
    print("[SetupLoRa()] LoRa Interface is cool with ", mac, " mac.")
    print("[SetupLoRa()] Configuring THREAD mesh protocol over LoRa Interface.")
    # Config LoRaMesh
    mymesh = Loramesh(lora)

    # Waiting until mesh comm.
    # exits when either "PYMESH_ROLE_CHILD", "PYMESH_ROLE_ROUTER" or "PYMESH_ROLE_LEADER"
    cstate = mymesh._state_update()
    while (cstate<2):      
        mymesh.led_state()
        print("[SetupLoRa()] %d: %s ... waiting." % (time.time(), PYMESHSTATE[cstate]))
        time.sleep(2)
        cstate = mymesh._state_update()

    # Set Hub as LEADER of MESH
    #mymesh.state = Loramesh.STATE_LEADER # As mesh leader

    node_ip_addr = mymesh.ip()
    print("[SetupLoRa()] %d: connected as %s. IP: %s" % (time.time(), PYMESHSTATE[cstate], node_ip_addr))

    # UDP socket bind in 1234 port
    s = socket.socket(socket.AF_LORA, socket.SOCK_RAW)
    s.bind(UDPPORT)
    # Set callback for LoRa 
    #mymesh.mesh.rx_cb(receiveLoRaPackets, s)
    # Log
    print("[SetupLoRa()] %d: Binding LoRa socket to %d port." % (time.time(), UDPPORT))

    return (mymesh,s)

##########################################
# Ram status in bytes
##########################################
def MemInfoBytes():
    mtotal = gc.mem_alloc() +  gc.mem_free()
    print('[MemInfoBytes()] Free [Bytes]: ' + str(gc.mem_alloc()))
    print('[MemInfoBytes()] Alloc [Bytes]: ' + str(gc.mem_free()))
    print('[MemInfoBytes()] Total [Bytes]: ' + str(mtotal))

##########################################
# Ram status in %
##########################################
def MemInfoPcg():
    mtotal = gc.mem_alloc() +  gc.mem_free()
    mfree = 100*(gc.mem_free()/mtotal)
    malloc = 100*(gc.mem_alloc()/mtotal)
    print('[MemInfoPcg()] Free [%]: ' + str(mfree))
    print('[MemInfoPcg()] Alloc [%]: ' + str(malloc))

##########################################
# Garbage cleaner (Heap cleaning)
##########################################
def GarbageCleaner():
    gc.collect()
    gc.threshold(gc.mem_free())

########################################
# Global vars
########################################
# For using in fcircelab
MyComm = ServerComm('192.168.90.100',8888) 
# For using in home
#MyComm = ServerComm('192.168.0.18',8888) 
# For using with WLAN/ETH Bridge
#MyComm = ServerComm('192.168.1.1',8888) 
MyNodes = ListOfNodes(MAX_NODES_FOR_HUB)
MyMacAddress = ''

# Disable default cnf
DisableDefaultCnf()

# Connect to router wifi
MyComm.ConnectToWifi(120)

# Connect to ntp server
MyComm.ConnectToNTP()
print("[main()] Date: ", time.localtime())

# Setup thread mesh over lora
(mesh,lorasock) = SetupLoRa()

# Get current ipv6 in mesh and set timer
last_ip = mesh.ip()

# Setup timers
last_comm_tt = time.time()
last_mesh_tt = time.time()
last_hb_tt = time.time()
last_gc_tt = time.time()
########################################
# Loop
########################################
while True:

    # Send data to server in case of data 
    if (time.time() - last_mesh_tt) > 0.05:
        # Check if some node send data to the mesh
        print('[main()] Reading LoRa - Mesh UDP port ...')
        RawData, PeerAddr = lorasock.recvfrom(128)
        if len(RawData) > 0:

            # Get ip and port
            PeerIP = PeerAddr[0]
            PeerPort = PeerAddr[1]

            # Log received LoRa packet
            print('[main()] Incomming %d bytes from %s (port %d)'%(len(RawData), PeerIP, PeerPort))
            print('[main()] %s'%(RawData.decode("utf-8")))

            # Parse data
            ParsedData = parseNodeData(RawData.decode("utf-8"))
            #print("[receiveLoRaPackets()] Node - MAC:", ParsedData[0])
            #print("[receiveLoRaPackets()] Node - Packet Num", ParsedData[1])
            #print("[receiveLoRaPackets()] Node - Temp. Array", ParsedData[2])
            print("[main()] Message was parsed.")

            # Create NodeData Object
            Node = NodeData(ParsedData[0])

            # Check if exist in list
            IsNodeInList = MyNodes.FromNodeMacToListIdx(ParsedData[0])

            # Inject to list (Check if edit existing entry or add new entry)
            if IsNodeInList == NOK_FLAG:
                # Edit object info
                Node.SetLastTimeOfComm(time.time())
                Node.SetTemperaturesOfNode(ParsedData[2])
                Node.SetNameOfNode(ParsedData[3])
                # Inject
                MyNodes.AddNodeToList(Node)
                print("[main()] New node and its data added to hub's list")
            else:
                # Edit object info
                Node.SetLastTimeOfComm(time.time())
                Node.SetTemperaturesOfNode(ParsedData[2])
                Node.SetNameOfNode(ParsedData[3])
                # Inject
                print(IsNodeInList)
                if IsNodeInList is None:
                    MyNodes.AddNodeToList(Node)
                    print("[main()] New node and its data added to hub's list")
                else:
                    MyNodes.EditNodeFromList(IsNodeInList,Node)
                    print("[main()] Node data edited from hub's list")

            # Quit object
            del Node

        print('[main()] Running Thread-Over-LoRa task')
        # Led alive
        print('[main()] Running Led-Alive')
        pycom.rgbled(0xFF0000)
        time.sleep(0.1)
        pycom.rgbled(0x000000) 

        # Update my state in LoRaMesh
        print('[main()] Running mesh state update')
        try:
            mesh.led_state()
            c_state = mesh.cli('state')
            if "true" in mesh.cli('singleton'):
                single = "is singleton"
            else:
                single = "not singleton"
            print("[main()] %d: %s, IP: %s [%s]" % (time.time(), MESHSTATE[c_state], last_ip, single))
        except:
                pass

        # Update my ipv6
        if mesh.ip() != last_ip:
            print("[main()] ipv6 changed from: %s to %s"%(last_ip, mesh.ip()))
            last_ip = mesh.ip()

        # Update list of neighbors
        neigbors = mesh.neighbors_ip()
        print('[main()] Verifying if neighbours are alive (via lora ping)')
        print("[main()] %d: neighbors = %d; IPv6 list: %s" % (time.time(), len(neigbors), neigbors))
        # Test ping with every neighbour
        for neighbor in neigbors:
            try:
                tping = mesh.ping(neighbor)
                if tping > 0:
                    print('[main()] %d: ping OK from neighbor %s' % (time.time(), neighbor))
                    # mesh.blink(10, .1)
                else:
                    print('[main()] %d: ping not received from neighbor %s' % (time.time(), neighbor))
            except Exception as e:
                print("[main()] exception doing PING: "+str(e))
                pass
        nbtab = mesh.cli('neighbor table')
        print('[main()] Current network status \r\n' + nbtab)

        # Updte timer
        last_mesh_tt = time.time()

    # Send data to server in case of data 
    if (time.time() - last_comm_tt) > 20:
        print('[main()] Running server comm. task')
        # Send data 
        if MyNodes.NumberOfNodes > 0:
            # Print
            print("[main()] Sending List-Of-Nodes data to platform")
            # Data as json
            HubData = MyNodes.GetListAsJsonFmt()
            # Send data to platform (wifi)
            MyComm.PostToServer('/upload', HubData)
        # Update timer
        last_comm_tt = time.time()

    # Send data to server in case of data 
    if (time.time() - last_hb_tt) > 20:
        print('[main()] Running heartbeat comm. task')
        # POST Heartbeat
        MyComm.HeartBeat()
        # Update timer
        last_hb_tt = time.time()

    # Send data to server in case of data 
    if (time.time() - last_gc_tt) > 2:
        print('[main()] Running garbage cleaner (in heap)')
        # Clean heap
        GarbageCleaner()
        # Check mem and print
        MemInfoPcg()
        # Send mem usage to server
        mtotal = gc.mem_alloc() +  gc.mem_free()
        mfree = 100*(gc.mem_free()/mtotal)
        malloc = 100*(gc.mem_alloc()/mtotal)
        MyComm.PostRamUsage(mfree, malloc)
        # Update timer
        last_gc_tt = time.time()

Our lorameshlib is:

#!/usr/bin/env python
#
# Copyright (c) 2019, Pycom Limited.
#
# This software is licensed under the GNU GPL version 3 or any
# later version, with permitted additional terms. For more information
# see the Pycom Licence v1.0 document supplied with this file, or
# available at https://www.pycom.io/opensource/licensing
#

from network import LoRa
import socket
import time
import utime
import ubinascii
import pycom

__version__ = '2'

class Loramesh:
    """ Class for using Lora Mesh - openThread """

    STATE_DISABLED = const(0)
    STATE_DETACHED = const(1)
    STATE_CHILD = const(2)
    STATE_ROUTER = const(3)
    STATE_LEADER = const(4)
    STATE_LEADER_SINGLE = const(5)

    # rgb LED color for each state: disabled, detached, child, router, leader and single leader
    RGBLED = [0x0A0000, 0x0A0000, 0x0A0A0A, 0x000A00, 0x0A000A, 0x000A0A]

    # address to be used for multicasting
    MULTICAST_MESH_ALL = 'ff03::1'
    MULTICAST_MESH_FTD = 'ff03::2'

    MULTICAST_LINK_ALL = 'ff02::1'
    MULTICAST_LINK_FTD = 'ff02::2'

    def __init__(self, lora=None):
        """ Constructor """
        if lora is None:
            self.lora = LoRa(mode=LoRa.LORA, region=LoRa.EU868, bandwidth=LoRa.BW_125KHZ, sf=7)
        else:
            self.lora = lora
        self.mesh = self.lora.Mesh()
        self.rloc = ''
        self.ip_eid = ''
        self.ip_link = ''
        self.single = True
        self.state = STATE_DISABLED
        self.ip_others = []

    def _state_update(self):
        """ Returns the Thread role """
        self.state = self.mesh.state()
        if self.state < 0:
            self.state = self.STATE_DISABLED
        return self.state

    def _rloc_ip_net_addr(self):
        """ returns the family part of RLOC IPv6, without last word (2B) """
        self.net_addr = ':'.join(self.rloc.split(':')[:-1]) + ':'
        return self.net_addr

    def _update_ips(self):
        """ Updates all the unicast IPv6 of the Thread interface """
        self.ip_others = []
        ips = self.mesh.ipaddr()
        self.rloc16 = self.mesh.rloc()
        for line in ips:
            if line.startswith('fd'):
                # Mesh-Local unicast IPv6
                try:
                    addr = int(line.split(':')[-1], 16)
                except Exception:
                    continue
                if addr == self.rloc16:
                    # found RLOC
                    # RLOC IPv6 has x:x:x:x:0:ff:fe00:RLOC16
                    self.rloc = line
                elif ':0:ff:fe00:' not in line:
                    # found Mesh-Local EID
                    self.ip_eid = line
            elif line.startswith('fe80'):
                # Link-Local
                self.ip_link = line
            else:
                self.ip_others.append(line)

    def is_connected(self):
        """ Returns true if it is connected as either Child, Router or Leader """
        connected = False
        self.state = self.mesh.state()
        if self.state in (STATE_CHILD, STATE_ROUTER, STATE_LEADER, STATE_LEADER_SINGLE):
            connected = True
        return connected

    def led_state(self):
        """ Sets the LED according to the Thread role """
        if self.state == STATE_LEADER and self.mesh.single():
            pycom.rgbled(self.RGBLED[self.STATE_LEADER_SINGLE])
        else:
            pycom.rgbled(self.RGBLED[self.state])

    # returns the IP ML-EID or the ip having this prefix
    def ip(self, prefix = None):
        """ Returns the IPv6 RLOC """
        ip = self._update_ips()
        if prefix is None:
            return self.ip_eid
        # we need to check al IPs from self.ip_others that may start with prefix
        p = prefix.split("::")[0]
        for ip in self.ip_others:
            if ip.startswith(p):
                return ip
        return None

    def neighbors(self):
        """ Returns a list with all properties of the neighbors """
        return self.mesh.neighbors()

    def neighbors_ip(self):
        """ Returns a list with IPv6 (as strings) of the neighbors """
        neighbors = self.neighbors()
        nei_list = []
        net_ip = self._rloc_ip_net_addr()
        if neighbors is not None:
            for nei_rec in neighbors:
                nei_ip = net_ip + hex(nei_rec.rloc16)[2:]
                nei_list.append(nei_ip)
        return nei_list

    def ipaddr(self):
        """ Returns a list with all unicast IPv6 """
        return self.mesh.ipaddr()

    def cli(self, command):
        """ Simple wrapper for OpenThread CLI """
        return self.mesh.cli(command)

    def ping(self, ip):
        """ Returns ping return time, to an IP """
        res = self.cli('ping ' + ip)
        # '8 bytes from fdde:ad00:beef:0:0:ff:fe00:e000: icmp_seq=2 hlim=64 time=236ms\r\n'
        # 'Error 6: Parse\r\n'
        # no answer
        ret_time = -1
        try:
            ret_time = int(res.split('time=')[1].split('ms')[0])
        except Exception:
            pass
        return ret_time

    def blink(self, num = 3, period = .5, color = None):
        """ LED blink """
        if color is None:
            color = self.RGBLED[self.state]
        for i in range(0, num):
            pycom.rgbled(0)
            time.sleep(period)
            pycom.rgbled(color)
            time.sleep(period)
            self.led_state()

Anyone has a similar issue?

Thank you!

CIRCk8 commented 1 year ago

@Xykon @elmasria