Replace docker's healthchecks with restart

piontkovsk11andre1 commented 1 year ago
Docker health checks are not suitable for a real development environment.
[x] Create a list of used health checks for future use with Nomad jobs.
[x] Replace the health check dependencies with "restart always".
Overrides #47
piontkovsk11andre1 commented 1 year ago
Container healthchecks, for reference:
  # NPM package publisher:
  npm-publish:
    # Start only after the verdaccio service reports healthy.
    depends_on:
      verdaccio:
        condition: service_healthy
    # Healthy when http://localhost:8080/health answers a spider
    # (no-download) request; any wget failure is reported as exit code 1.
    healthcheck:
      test: 'wget -q --spider http://localhost:8080/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Client live-server (React):
  client:
    # Start only after the verdaccio service reports healthy.
    depends_on:
      verdaccio:
        condition: service_healthy
    # Healthy when the web root on port 80 answers a spider (no-download)
    # request; any wget failure is reported as exit code 1.
    healthcheck:
      test: 'wget -q --spider http://localhost:80/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Client build (React):
  client-build:
    # Start only after the npm-publish service reports healthy.
    depends_on:
      npm-publish:
        condition: service_healthy
    # Healthy when http://localhost:8080/health answers a spider request.
    # Uses a longer grace period (1m30s) than the other services listed here.
    healthcheck:
      test: 'wget -q --spider http://localhost:8080/health >/dev/null || exit 1'
      start_period: 1m30s
      interval: 10s
      timeout: 5s
      retries: 3
  # WebAPI container (Express.js):
  api-v1:
    # Start only after the verdaccio service reports healthy.
    depends_on:
      verdaccio:
        condition: service_healthy
    # Healthy when http://localhost:8080/health answers a spider
    # (no-download) request; any wget failure is reported as exit code 1.
    healthcheck:
      test: 'wget -q --spider http://localhost:8080/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Jobs container (Node.js):
  jobs:
    # Start only after the verdaccio service reports healthy.
    depends_on:
      verdaccio:
        condition: service_healthy
    # Healthy when http://localhost:8080/health answers a spider
    # (no-download) request; any wget failure is reported as exit code 1.
    healthcheck:
      test: 'wget -q --spider http://localhost:8080/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Login container (Express.js + Passport.js):
  login:
    # Start only after the verdaccio service reports healthy.
    depends_on:
      verdaccio:
        condition: service_healthy
    # Healthy when http://localhost:8080/health answers a spider
    # (no-download) request; any wget failure is reported as exit code 1.
    healthcheck:
      test: 'wget -q --spider http://localhost:8080/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Views container (Express.js):
  views:
    # Start only after the verdaccio service reports healthy.
    depends_on:
      verdaccio:
        condition: service_healthy
    # Healthy when http://localhost:8080/health answers a spider
    # (no-download) request; any wget failure is reported as exit code 1.
    healthcheck:
      test: 'wget -q --spider http://localhost:8080/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Ingress reverse proxy (Traefik):
  traefik:
    healthcheck:
      # Traefik's CLI exposes its self-probe as the `healthcheck` subcommand;
      # `--ping` enables the ping endpoint for that invocation. The probe
      # calls /ping, so ping must also be enabled on the running instance.
      # NOTE(review): confirm ping is enabled in Traefik's static config.
      test: traefik healthcheck --ping >/dev/null || exit 1
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s
    # Start only after the consul service reports healthy.
    depends_on:
      consul:
        condition: service_healthy
  # NPM repository (Verdaccio):
  verdaccio:
    # Healthy when the /-/ping endpoint on port 4873 answers a single
    # (--tries=1) spider request.
    healthcheck:
      test: 'wget --no-verbose --tries=1 --spider http://localhost:4873/-/ping >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # SQL database (Postgres):
  postgres:
    # Healthy when pg_isready succeeds for user "postgres_user" against
    # database "application".
    healthcheck:
      test: 'pg_isready -U postgres_user -d application >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Postgres prometheus metrics exporter:
  postgres-metrics:
    # Start only after the postgres service reports healthy.
    depends_on:
      postgres:
        condition: service_healthy
    # Healthy when the exporter's root page on port 9187 answers a single
    # (--tries=1) spider request.
    healthcheck:
      test: 'wget --no-verbose --tries=1 --spider http://localhost:9187/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # OLAP database (ClickHouse):
  clickhouse:
    # Healthy when the HTTP /ping endpoint on port 8123 answers a single
    # (--tries=1) spider request.
    healthcheck:
      test: 'wget --no-verbose --tries=1 --spider http://localhost:8123/ping >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Message broker (RabbitMQ):
  rabbitmq:
    # Healthy when rabbitmq-diagnostics confirms port connectivity.
    # Uses a 1m grace period, longer than most services listed here.
    healthcheck:
      test: 'rabbitmq-diagnostics check_port_connectivity >/dev/null || exit 1'
      start_period: 1m
      interval: 10s
      timeout: 5s
      retries: 3
  # RabbitMQ metrics exporter:
  # No healthcheck of its own is listed; it only waits for the broker.
  rabbitmq-metrics:
    # Start only after the rabbitmq service reports healthy.
    depends_on:
      rabbitmq:
        condition: service_healthy
  # Document-oriented database (Mongo):
  mongo:
    # Healthy when mongosh can run the admin "ping" command.
    # Unlike the other probes listed here, output is not redirected to
    # /dev/null.
    healthcheck:
      test: >-
        mongosh --eval "db.adminCommand('ping')" || exit 1
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Mongo metrics exporter:
  mongo-metrics:
    # Start only after the mongo service reports healthy.
    depends_on:
      mongo:
        condition: service_healthy
    # Healthy when the exporter's root page on port 9216 answers a spider
    # (no-download) request.
    healthcheck:
      test: 'wget -q --spider http://localhost:9216/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Object storage engine (Minio):
  minio:
    # Healthy when the root page on port 9001 returns a success status
    # (curl --fail turns HTTP errors into a non-zero exit).
    # NOTE(review): this probes port 9001 rather than MinIO's dedicated
    # /minio/health/live endpoint on the API port - confirm that is intended.
    healthcheck:
      test: 'curl --silent --fail http://localhost:9001/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Key-value storage (Redis):
  redis:
    # Healthy when redis-cli can ping the local server.
    healthcheck:
      test: 'redis-cli ping >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Redis metrics exporter:
  redis-metrics:
    # Start only after the redis service reports healthy.
    depends_on:
      redis:
        condition: service_healthy
    # Healthy when the exporter's root page on port 9121 answers a spider
    # (no-download) request.
    healthcheck:
      test: 'wget -q --spider http://localhost:9121/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # SQL administration interface (Adminer):
  adminer:
    # Start only after the postgres service reports healthy.
    depends_on:
      postgres:
        condition: service_healthy
    # Healthy when the web root on port 8080 answers a spider (no-download)
    # request.
    healthcheck:
      test: 'wget -q --spider http://localhost:8080/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Mongo administration interface (Mongo Express):
  mongo-express:
    # Healthy when the web root on port 8081 answers a spider (no-download)
    # request; any wget failure is reported as exit code 1.
    healthcheck:
      test: wget -q --spider http://localhost:8081/ >/dev/null || exit 1
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s
    # Start only after the mongo service reports healthy.
    depends_on:
      mongo:
        condition: service_healthy
  # Search engine (OpenSearch):
  opensearch:
    # Healthy when an authenticated HTTPS request to port 9200 returns a
    # success status.
    # NOTE(review): the probe hard-codes admin:admin credentials and skips
    # TLS verification (-k) - confirm this is acceptable outside local dev.
    healthcheck:
      test: "curl --silent --fail https://localhost:9200 -ku 'admin:admin' >/dev/null || exit 1"
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # OpenSearch dashboard:
  # No healthcheck of its own is listed; it only waits for the search engine.
  opensearch-dashboards:
    # Start only after the opensearch service reports healthy.
    depends_on:
      opensearch:
        condition: service_healthy
  # Business intelligence tool (Metabase):
  metabase:
    # Start only after both the mongo and clickhouse services report healthy.
    depends_on:
      mongo:
        condition: service_healthy
      clickhouse:
        condition: service_healthy
    # Healthy when /api/health on port 3000 answers a spider (no-download)
    # request.
    healthcheck:
      test: 'wget -q --spider http://localhost:3000/api/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Time-series and realtime data visualization tool (Grafana):
  grafana:
    # Healthy when /api/health on port 3000 answers a spider (no-download)
    # request.
    healthcheck:
      test: 'wget -q --spider http://localhost:3000/api/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Log aggregation system (Loki):
  loki:
    # Healthy when /ready on port 3100 answers a spider (no-download) request.
    healthcheck:
      test: 'wget -q --spider http://localhost:3100/ready >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Monitoring system & time series database (Prometheus):
  prometheus:
    # Start only after every listed backend and exporter reports healthy.
    # NOTE(review): rabbitmq-metrics has no healthcheck in this listing;
    # service_healthy needs one from the compose file or the image - confirm.
    depends_on:
      mimir:
        condition: service_healthy
      minio:
        condition: service_healthy
      mongo-metrics:
        condition: service_healthy
      postgres-metrics:
        condition: service_healthy
      rabbitmq-metrics:
        condition: service_healthy
    # Healthy when /-/healthy on port 9090 answers a spider (no-download)
    # request.
    healthcheck:
      test: 'wget -q --spider http://localhost:9090/-/healthy >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Metrics backend (Mimir):
  mimir:
    # Healthy when /ready on port 9009 answers a spider (no-download) request.
    healthcheck:
      test: 'wget -q --spider http://localhost:9009/ready >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Distributed tracing backend (Tempo):
  tempo:
    # Healthy when /ready on port 9411 answers a spider (no-download) request.
    # NOTE(review): 9411 is conventionally the Zipkin receiver port - confirm
    # that Tempo's HTTP server (which serves /ready) listens there.
    healthcheck:
      test: 'wget -q --spider http://localhost:9411/ready >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # API auto-documentation (Swagger):
  swagger:
    # Healthy when the web root on port 80 answers a spider (no-download)
    # request.
    healthcheck:
      test: 'wget -q --spider http://localhost:80/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Remote terminal app (xterm.js):
  xterm:
    # Healthy when the web root on port 80 answers a spider (no-download)
    # request.
    healthcheck:
      test: 'wget -q --spider http://localhost:80/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Service management (Consul):
  consul:
    # Healthy when the root page on port 8500 answers a spider (no-download)
    # request.
    healthcheck:
      test: 'wget -q --spider http://localhost:8500/ >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Service management configuration import (Consul):
  # No healthcheck of its own is listed; it only waits for Consul.
  consul-kv-import:
    # Start only after the consul service reports healthy.
    depends_on:
      consul:
        condition: service_healthy
  # Sensitive data storage (Vault):
  vault:
    # Healthy when /v1/sys/health on port 8200 answers a spider request.
    # NOTE(review): by default that endpoint returns non-2xx for sealed,
    # uninitialized and standby nodes - confirm that all of these should
    # count as unhealthy.
    healthcheck:
      test: 'wget -q --spider http://localhost:8200/v1/sys/health >/dev/null || exit 1'
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 3
  # Service discovery registration (Registrator):
  # No healthcheck of its own is listed; it only waits for Consul.
  registrator:
    # Start only after the consul service reports healthy.
    depends_on:
      consul:
        condition: service_healthy
  # Log collection service (Promtail):
  # No healthcheck of its own is listed; it only waits for Loki.
  promtail:
    # Start only after the loki service reports healthy.
    depends_on:
      loki:
        condition: service_healthy
piontkovsk11andre1 commented 1 year ago
I've added restart: always to all containers.
piontkovsk11andre1 commented 1 year ago
Containers that are meant to run only once at the start will use restart: unless-stopped.
piontkovsk11andre1 commented 1 year ago
Healthchecks have been updated.
vendgine / directory

Replace docker's healthchecks with restart #48