knolleary / pubsubclient

A client library for the Arduino Ethernet Shield that provides support for MQTT.
http://pubsubclient.knolleary.net/
MIT License
3.84k stars 1.47k forks source link

ESP8266 fails to receive messages it is subscribed to after a while #458

Open juichi opened 6 years ago

juichi commented 6 years ago

Hello,

Thank you for the awesome work.

In several of my projects I have observed that my clients, all ESP8266s, will fail to either receive or process messages after a while. The time period varies from 12 - 18 hours before the boards stop responding. I have tried adding a 100 ms delay before calling client.loop(); it may have helped, but the clients still failed to receive subscribed messages. A reset of the ESP fixes the issue.

I wrote some barebones code around my typical template, where the ESP8266 will publish a message if it has not heard from the broker on a query topic. This was running for about a day and I caught it failed this morning. Here is the barebones code:

#include <ESP8266WiFi.h>
#include <ESP8266mDNS.h>
#include <DNSServer.h>
#include <ESP8266WebServer.h>
#include <WiFiUdp.h>
#include <ArduinoOTA.h>
#include <PubSubClient.h>
#include <WiFiManager.h> 

// ---- Broker address and the client id presented on connect ----
#define mqtt_server "192.168.1.200"
#define MQTT_ID "ESP MQTT TEST"

// ---- MQTT topics used by this sketch ----
#define mqtt_heartbeat "test/heartbeat"        // heartbeat counter is published here
#define mqtt_query "test/query"                // subscribed; any message triggers a heartbeat
#define ota_message "test/OTA"                 // subscribed; payload "on"/"off" toggles OTA
#define ota_status "test/ota"                  // current OTA state is published here
#define mqtt_subcribefail "test/mqttsubfail"   // "on" when no query was seen recently

// ---- Network objects: PubSubClient runs on top of the WiFi TCP client ----
WiFiClient espClient;
PubSubClient client(espClient);

// ---- Runtime state ----
boolean OTAon;             // true while ArduinoOTA.handle() should be serviced
int heartbeat;             // number of query messages answered so far
int lastheartbeat;         // not referenced by the code shown in this sketch
long mqtt_check_timer;     // millis() timestamp of the last subscription self-check
long last_heartbeat_time;  // millis() timestamp of the last answered query

// One-time initialisation: serial port, WiFi (through WiFiManager), the
// ArduinoOTA event handlers, and finally the MQTT connection together with
// the initial subscriptions and retained status messages.
void setup() {
  Serial.begin(115200);
  delay(10);

  // WiFiManager handles joining the network; "AutoConnectAP" is the name
  // passed to autoConnect().
  WiFiManager wifiManager;
  // wifiManager.resetSettings();
  wifiManager.autoConnect("AutoConnectAP");

  // Port defaults to 8266
  // ArduinoOTA.setPort(8266);

  // Hostname defaults to esp8266-[ChipID]
  ArduinoOTA.setHostname("ESP Subscribe Test");

  // No authentication by default
  // ArduinoOTA.setPassword((const char *)"123"); // password

  ArduinoOTA.onStart([]() {
    // digitalWrite(led,HIGH);
    Serial.println("Start");
  });
  ArduinoOTA.onEnd([]() {
    // digitalWrite(led,LOW);
    Serial.println("\nEnd");
  });
  ArduinoOTA.onProgress([](unsigned int progress, unsigned int total) {
    // total / 100 is 0 for any total below 100; guard it so the progress
    // report can never divide by zero.
    const unsigned int step = total / 100;
    const unsigned int percent = (step != 0) ? (progress / step) : 0;
    Serial.printf("Progress: %u%%\r", percent);
  });
  ArduinoOTA.onError([](ota_error_t error) {
    Serial.printf("Error[%u]: ", error);
    if (error == OTA_AUTH_ERROR) Serial.println("Auth Failed");
    else if (error == OTA_BEGIN_ERROR) Serial.println("Begin Failed");
    else if (error == OTA_CONNECT_ERROR) Serial.println("Connect Failed");
    else if (error == OTA_RECEIVE_ERROR) Serial.println("Receive Failed");
    else if (error == OTA_END_ERROR) Serial.println("End Failed");
  });
  ArduinoOTA.begin();
  Serial.println("Ready");
  Serial.print("IP address: ");
  Serial.println(WiFi.localIP());

  // MQTT broker endpoint and the handler for incoming subscribed messages.
  client.setServer(mqtt_server, 1883);
  client.setCallback(callback);

  // Block until the first MQTT connection succeeds, retrying every 2 seconds.
  while (!client.connected()) {
    Serial.println("Connecting to MQTT...");
    if (client.connect(MQTT_ID)) {
      Serial.println("connected");
    } else {
      Serial.print("failed with state ");
      Serial.print(client.state());
      delay(2000);
    }
  }

  // Initial subscriptions, then publish the retained "off" state for OTA.
  client.subscribe(ota_message);
  client.subscribe(mqtt_query);
  client.publish(ota_message, "off", true);
  client.publish(ota_status, "off", true);

  heartbeat = 0;
  mqtt_check_timer = millis();
}

// Main loop: service the MQTT client, reconnect when the link dropped,
// optionally service OTA, and once a minute publish whether a query message
// has been received recently.
void loop() {

  client.loop();

  if (!client.connected()) {
    reconnect();
  }

  // OTA is only serviced while it has been switched on via MQTT.
  if (OTAon) {
    ArduinoOTA.handle();
  }

  // Subscription self-check, at most once per minute. A single millis()
  // snapshot is used so both comparisons see the same instant, and the state
  // is decided with one expression so that an elapsed time of exactly
  // 30000 ms still refreshes the retained flag (it counts as "off").
  const unsigned long now = millis();
  if (now - mqtt_check_timer > 60000) {
    mqtt_check_timer = now;
    const bool heartbeatStale = (now - last_heartbeat_time) > 30000;
    client.publish(mqtt_subcribefail, heartbeatStale ? "on" : "off", true);
  }

}

//Here is the MQTT callback function.

void callback(char* topic, byte* payload, unsigned int length) {

  char message[length+1];

  //Parse & Payload into array
  for (int i = length; i >= 0; i--) {
   message[i] = (char)payload[i];
  }

  //add null character to make string -- maybe I don't need to do this and cast payload as char and do strcmp
  message[length] = '\0';

//#define heartbeat "test/heartbeat"
//#define ota_message "test/OTA"
//#define ota_status "test/ota"
//#define mqtt_subcribefail "test/mqttsubfail"
//#define mqtt_query "test/query"
//#define MQTT_ID "ESP MQTT TEST"

  Serial.println((char)message[0]);
  String payloadstring = String(message);
  Serial.println("Here's the payload!");
  Serial.println(payloadstring);

  if(strcmp(topic,ota_message) == 0 && payloadstring == "on"){
    OTAon = true;
    Serial.println("Turning on OTA function per MQTT message");
    client.publish(ota_status, "on", true);
  }

  if(strcmp(topic,ota_message) == 0 && payloadstring == "off"){
    OTAon = false;
    Serial.println("Turning off OTA function per MQTT message");
    client.publish(ota_status, "off", true);
  }

  if(strcmp(topic,mqtt_query) == 0){        //don't care what the payload is
     client.publish(mqtt_heartbeat, String(heartbeat).c_str(), true);
     last_heartbeat_time = millis();
     heartbeat++;
  }
}

void reconnect() {
  // Loop until we're reconnected
  while (!client.connected()) {
    Serial.print("Attempting MQTT connection...");
    // Attempt to connect
    // If you do not want to use a username and password, change next line to
    // if (client.connect("ESP8266Client")) {
    if (client.connect(MQTT_ID)) {
      Serial.println("connected");
    } else {
      Serial.print("failed, rc=");
      Serial.print(client.state());
      Serial.println(" try again in 5 seconds");
      // Wait 5 seconds before retrying
      delay(5000);
    }
  }
}

Thanks for looking, Neal

juichi commented 6 years ago

I have since added an MQTT publish at the top of the callback routine, since client.loop() should pass the new message on to the callback and that routine will run. The idea here is to inspect whether the problem is in the callback or lies within the client.loop() function. I still have the heartbeat checks running, and when they stall I should be able to see whether the ESP even makes it to the callback function; that is, whether messages stop being sent from the main callback or only after the topic down-select. Guess I will find out in about a day.

Thanks, Neal

juichi commented 6 years ago

About 16 hours ago, the ESP stopped entering the callback. This was confirmed by a publish command in the callback that did not send a message, and by the ESP noting that it has not updated its heartbeat in the code above.

I have not had much time to examine the issue at all, but it is clear that the callback loop for subscribed messages is not getting called after a while. I will take some more time this weekend to troubleshoot this, but I am curious, is there a constraint against having more than one topic subscribed? I am thinking surely not. I am leaning to some sort of resource glitch or a blocking process/interrupt on a super low level that breaks the client.loop() somewhere for subscriptions as publish is not affected.

For now, I will experiment with detecting this failure adding in some persistence timers, then unsubscribing and disconnecting the client from the MQTT broker. After a little break I can have the device reconnect and resubscribe. Or I could just have the esp reset itself when it detects the fault. Neither of which options are ideal.

Anyways, I know this is a bit of a ramble and, of course, I could be doing something completely stupid.

Thanks for the help and the awesome work! Neal

TD-er commented 6 years ago

Did you find something new on this topic? At ESPeasy we get similar reports like this one.

juichi commented 6 years ago

I have implemented a heart beat check routine where the ESP is polled by the server with an mqtt message. And if the ESP does not hear from the server for 1 minute, the ESP will reboot itself. It works after the reboot. That's the best I can do.

cifarellispa commented 5 years ago

Had the same issue. You can reproduce the issue simply by disconnecting and reconnecting periodically. After reconnection you lose any subscriptions. Apparently solved by setting the CleanSession flag to FALSE during connect.

HugoML commented 5 years ago

Any progress on this? This is a major setback for this library. An ESP restart takes quite some time to reconnect to WiFi and subscribe to a topic. I prefer not to restart the ESP just because an MQTT message is not received. pubsubclient.disconnect() followed by a reconnect does not seem to work well: the error code is -2 and it does not get connected. My pubsubclient connects well for a couple of hours, and after that it can still publish messages to the topic but cannot receive messages from the topic anymore.

@knolleary could you please take a look at this? Many thanks!

chaseTfreeman commented 4 years ago

Hello I'm having similar issues and hoping someone has a working example. My issue is that ".subscribe" is not calling the callback. I am seeing a "1" returned when I print the result of the subscribe.

I am using pubsubclient, wificlientsecure, and wifimanager. I'm using AWS as my MQTT server. I have no problem publishing but cannot subscribe successfully.

include

#include <PubSubClient.h>
#include <NTPClient.h>
#include <WiFiUdp.h>
#include <WiFiManager.h> //https://github.com/tzapu/WiFiManager

// Update these with values suitable for your network.
const char* AWS_endpoint = "[aws_string_censored]-ats.iot.us-east-1.amazonaws.com";

// NTP client running over UDP against pool.ntp.org.
WiFiUDP ntpUDP;
NTPClient timeClient(ntpUDP, "pool.ntp.org");

// MQTT client running on top of the TLS-capable WiFi client.
WiFiClientSecure espClient;
PubSubClient client(espClient);

// Scratch state; not referenced by the code shown in this sketch.
long lastMsg = 0;
char msg[50];
int value = 0;

// Brings up WiFi through WiFiManager, waits until the station reports
// WL_CONNECTED, seeds the random generator and prints the local IP address.
void setup_wifi() {
  delay(10);

  // Local WiFiManager instance: it is only needed during setup.
  // autoConnect() tries the stored credentials first; if that fails it
  // starts a configuration access point (auto-generated name, since none
  // is given here) and blocks until it has been configured.
  WiFiManager wifiManager;
  wifiManager.autoConnect();

  // Reaching this point means autoConnect() has returned.
  Serial.println("connected...yeey :)");

  // Poll every 500 ms, printing one dot per wait, until the link is up.
  for (; WiFi.status() != WL_CONNECTED; Serial.print(".")) {
    delay(500);
  }

  randomSeed(micros());

  Serial.println("");
  Serial.println("WiFi connected");
  Serial.println("IP address: ");
  Serial.println(WiFi.localIP());
}

// MQTT message handler: switches the built-in LED according to the first
// payload byte and echoes the resulting state ("1"/"0") on "outTopic".
//
// topic   - topic name of the received message (unused)
// payload - raw message bytes
// length  - number of valid bytes in payload
void callback(char* topic, byte* payload, unsigned int length) {
  // Only look at payload[0] when the message actually carries data; an
  // empty message is treated the same as "not '1'".
  const bool ledOn = (length > 0) && (static_cast<char>(payload[0]) == '1');

  // The LED is active low on the ESP: LOW switches it on, HIGH switches it off.
  digitalWrite(BUILTIN_LED, ledOn ? LOW : HIGH);
  client.publish("outTopic", ledOn ? "1" : "0");
}

void reconnect() {
// Loop until we're reconnected
while (!client.connected()) {
    Serial.print("Attempting MQTT connection...");
    // Create a random client ID
    String clientId = "ESP8266Client-";
    clientId += String(random(0xffff), HEX);
    // Attempt to connect
    if (client.connect(clientId.c_str(),"username","password")) {
    Serial.println("connected");
    //   Once connected, publish an announcement...
    client.publish("outTopic", "hello world");
    ... and resubscribe
    //   print 1 if client is subscribes succesfully
    Serial.println(client.subscribe("outTopic"));
    client.subscribe("outTopic"); //<<<<<<<<<<<< This is where it fails to call the callback.
    } else {
    Serial.print("failed, rc=");
    Serial.print(client.state());
    Serial.println(" try again in 5 seconds");
    // Wait 5 seconds before retrying
    delay(5000);
    }
}
}

// One-time initialisation: LED pin, serial, WiFi, NTP time for certificate
// validation, TLS credentials loaded from SPIFFS, and the MQTT endpoint and
// message handler.
void setup() {
  pinMode(BUILTIN_LED, OUTPUT);  // Initialize the BUILTIN_LED pin as an output
  Serial.begin(115200);

  setup_wifi();

  // Keep forcing NTP updates until one succeeds, then hand the epoch time
  // to the TLS client.
  timeClient.begin();
  while (!timeClient.update()) {
    timeClient.forceUpdate();
  }
  espClient.setX509Time(timeClient.getEpochTime());

  delay(1000);
  if (!SPIFFS.begin()) {
    // Without the file system none of the credentials can be read.
    Serial.println("Failed to mount file system");
    return;
  }
  Serial.print("Heap: ");
  Serial.println(ESP.getFreeHeap());

  // Client certificate (replace cert.der with your uploaded file name).
  File cert = SPIFFS.open("/cert.der", "r");
  Serial.println(cert ? "Success to open cert file" : "Failed to open cert file");
  delay(1000);
  Serial.println(espClient.loadCertificate(cert) ? "cert loaded" : "cert not loaded");

  // Private key (replace private.der with your uploaded file name).
  File private_key = SPIFFS.open("/private.der", "r");
  Serial.println(private_key ? "Success to open private cert file" : "Failed to open private cert file");
  delay(1000);
  Serial.println(espClient.loadPrivateKey(private_key) ? "private key loaded" : "private key not loaded");

  // CA certificate (replace ca.der with your uploaded file name).
  File ca = SPIFFS.open("/ca.der", "r");
  Serial.println(ca ? "Success to open ca" : "Failed to open ca ");
  delay(1000);
  Serial.println(espClient.loadCACert(ca) ? "ca loaded" : "ca failed");

  Serial.print("Heap: ");
  Serial.println(ESP.getFreeHeap());

  // MQTT over TLS on port 8883, with callback() handling incoming messages.
  client.setServer(AWS_endpoint, 8883);
  client.setCallback(callback);
}

// Main loop: make sure the MQTT link is up, then let the client process
// incoming and outgoing traffic.
void loop() {
  const bool linkUp = client.connected();
  if (!linkUp) {
    reconnect();
  }

  client.loop();
}
JJFourie commented 3 years ago

I'm experiencing the same when using PubSubClient in combination with a ESP32-Cam. After a couple of hours the callback code fails to react to any subscribed MQTT topics, while all other functionality including publishing topics remains fully working. A simple reboot resets the problem.

Does support on this library still exist?

nambabwe commented 3 years ago

Are there any clues in the MQTT server's log?

Groete

GM

On Tue, Dec 22, 2020, 17:18 Johan Fourie notifications@github.com wrote:

I'm experiencing the same when using PubSubClient in combination with a ESP32-Cam. After a couple of hours the callback code fails to react to any subscribed MQTT topics, while all other functionality including publishing topics remains fully working. A simple reboot resets the problem.

Does support on this library still exist?

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/knolleary/pubsubclient/issues/458#issuecomment-749831069, or unsubscribe https://github.com/notifications/unsubscribe-auth/ADEX2XQPFANEASMBSCCXOBLSWESNRANCNFSM4FHCAO2Q .

suleymanorun commented 3 years ago

My problem was the buffer size MQTTClient.h -> explicit MQTTClient(int bufSize = 128); "128 " to "256" problem solved.