mjanez / ckan-docker

Custom CKAN Docker Compose Deployment
https://mjanez.github.io/ckan-docs
0 stars 4 forks source link

Load data into datastore #10

Closed mjanez closed 7 months ago

mjanez commented 1 year ago

Now that the datapusher has been deprecated (7db1611), several alternatives are proposed for loading structured data into the CKAN database (datastore):

  1. ckanext-xloader (in background ckan-xloader container)

    • [ ] Fix xloader API Token update in CKAN_INI. https://github.com/mjanez/ckan-docker/blob/38930eb029ea551cbdfa90657e6c62ec30055e34/ckan/docker-entrypoint.d/setup_xloader.sh#L1-L15

    • [ ] Update ckan/setup/supervisord to include xloader worker in the background.

      supervisor.conf

      ; Main supervisord configuration: control socket, daemon settings and the
      ; include directory from which per-program configs are loaded.

      ; UNIX socket on which supervisord listens for control requests.
      ; NOTE(review): mode 0777 lets any local user talk to the socket - confirm this is intended.
      [unix_http_server]
      file = /tmp/supervisor.sock
      chmod = 0777
      chown = nobody:nogroup
      
      ; Daemon settings: log file capped at 50MB with 10 backups, pidfile in /tmp.
      [supervisord]
      logfile = /tmp/supervisord.log
      logfile_maxbytes = 50MB
      logfile_backups=10
      loglevel = info
      pidfile = /tmp/supervisord.pid
      ; Stay in the foreground instead of daemonizing.
      nodaemon = true
      umask = 022
      identifier = supervisor
      
      ; supervisorctl connects through the same socket declared in [unix_http_server].
      [supervisorctl]
      serverurl = unix:///tmp/supervisor.sock
      
      ; RPC interface factory used by supervisorctl to talk to supervisord.
      [rpcinterface:supervisor]
      supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
      
      ; Load every *.conf file found in /etc/supervisord.d.
      [include]
      files = /etc/supervisord.d/*.conf

      supervisor.worker.conf

      ; Background job worker: runs `ckan jobs worker` against /srv/app/ckan.ini
      ; as the `ckan` user, started automatically and restarted when it exits.
      [program:ckan-worker]
      command=ckan -c /srv/app/ckan.ini jobs worker
      priority=501
      autostart=true
      autorestart=true
      ; NOTE(review): redirect_stderr=true already merges stderr into stdout_logfile,
      ; so the stderr_logfile settings below look redundant - confirm.
      redirect_stderr=true
      ; Log to the container's stdout; maxbytes=0 disables rotation of the log target.
      stdout_logfile=/dev/stdout
      stdout_logfile_maxbytes=0
      stderr_logfile=/dev/stdout
      stderr_logfile_maxbytes=0
      user=ckan
      environment=HOME="/srv/app",USER="ckan"
    • [ ] Update setup/prerun.py

        import os
        import sys
        import subprocess
        import psycopg2
        try:
            from urllib.request import urlopen
            from urllib.error import URLError
        except ImportError:
            from urllib2 import urlopen
            from urllib2 import URLError
      
        import time
        import re
        import json
      
        # Path of the CKAN configuration file: taken from the CKAN_INI
        # environment variable, falling back to /srv/app/ckan.ini.
        ckan_ini = os.environ.get("CKAN_INI", "/srv/app/ckan.ini")
      
        # Default retry budget used by the connection checks when the caller
        # does not pass an explicit ``retry`` value.
        RETRY = 5
      
        def update_plugins():
            """Write the plugin list from ``XLOADER__PLUGINS`` into the CKAN config.

            The value of the ``XLOADER__PLUGINS`` environment variable (empty
            string when unset) is stored as ``ckan.plugins`` in the file named
            by ``ckan_ini`` through ``ckan config-tool``.

            Raises:
                subprocess.CalledProcessError: if ``ckan config-tool`` exits
                    with a non-zero status.
            """
            plugin_list = os.environ.get("XLOADER__PLUGINS", "")

            print("[prerun] Setting the following plugins in {}:".format(ckan_ini))
            print(plugin_list)

            setting = "ckan.plugins = {}".format(plugin_list)
            subprocess.check_output(
                ["ckan", "config-tool", ckan_ini, setting],
                stderr=subprocess.STDOUT,
            )

            print("[prerun] Plugins set.")
      
        def check_main_db_connection(retry=None):
            """Check that the main CKAN database is reachable.

            The connection string is read from ``CKAN_SQLALCHEMY_URL``. When the
            variable is unset or empty the check is skipped, as the log message
            says, instead of handing an empty DSN to ``check_db_connection``.

            Args:
                retry: remaining attempts, forwarded to ``check_db_connection``;
                    ``None`` lets it use its default.

            Returns:
                Whatever ``check_db_connection`` returns, or ``None`` when the
                check is skipped.
            """
            conn_str = os.environ.get("CKAN_SQLALCHEMY_URL")
            if not conn_str:
                print("[prerun] CKAN_SQLALCHEMY_URL not defined, not checking db")
                # Nothing to connect to: really skip the check.
                return None
            return check_db_connection(conn_str, retry)
      
        def check_datastore_db_connection(retry=None):
            """Check that the datastore database is reachable.

            The connection string is read from ``CKAN_DATASTORE_WRITE_URL``.
            When the variable is unset or empty the check is skipped, as the
            log message says, instead of handing an empty DSN to
            ``check_db_connection``.

            Args:
                retry: remaining attempts, forwarded to ``check_db_connection``;
                    ``None`` lets it use its default.

            Returns:
                Whatever ``check_db_connection`` returns, or ``None`` when the
                check is skipped.
            """
            conn_str = os.environ.get("CKAN_DATASTORE_WRITE_URL")
            if not conn_str:
                print("[prerun] CKAN_DATASTORE_WRITE_URL not defined, not checking db")
                # Nothing to connect to: really skip the check.
                return None
            return check_db_connection(conn_str, retry)
      
        def check_db_connection(conn_str, retry=None):
            """Wait until a database connection to ``conn_str`` can be opened.

            Each failed attempt prints the error, sleeps 10 seconds and consumes
            one unit of the retry budget. When the budget reaches zero the
            process exits with status 1.

            Args:
                conn_str: connection string passed to ``psycopg2.connect``.
                retry: remaining attempts; ``None`` means use ``RETRY``.
            """
            if retry is None:
                attempts_left = RETRY
            elif retry == 0:
                print("[prerun] Giving up after 5 tries...")
                sys.exit(1)
            else:
                attempts_left = retry

            while True:
                try:
                    db_conn = psycopg2.connect(conn_str)
                except psycopg2.Error as err:
                    print(str(err))
                    print("[prerun] Unable to connect to the database, waiting...")
                    time.sleep(10)
                    attempts_left -= 1
                    if attempts_left == 0:
                        print("[prerun] Giving up after 5 tries...")
                        sys.exit(1)
                else:
                    # Only a probe: release the connection straight away.
                    db_conn.close()
                    return
      
        def check_solr_connection(retry=None):
            """Wait until Solr answers and report whether the CKAN schema is loaded.

            The schema name is requested from ``<CKAN_SOLR_URL>/schema/name``.
            Each ``URLError`` prints the error, sleeps 10 seconds and retries
            with one attempt fewer; when no attempts are left the process exits
            with status 1.

            Args:
                retry: remaining attempts; ``None`` means use ``RETRY``.
            """
            if retry is None:
                retry = RETRY
            elif retry == 0:
                print("[prerun] Giving up after 5 tries...")
                sys.exit(1)

            url = os.environ.get("CKAN_SOLR_URL", "")
            search_url = '{url}/schema/name?wt=json'.format(url=url)

            try:
                connection = urlopen(search_url)
            except URLError as e:
                print(str(e))
                print("[prerun] Unable to connect to solr, waiting...")
                time.sleep(10)
                check_solr_connection(retry=retry - 1)
            else:
                # Read the body, then always release the HTTP response.
                try:
                    conn_info = connection.read()
                finally:
                    connection.close()
                schema_name = json.loads(conn_info)
                if 'ckan' in schema_name['name']:
                    print('[prerun] Succesfully connected to solr and CKAN schema loaded')
                else:
                    print('[prerun] Succesfully connected to solr, but CKAN schema not found')
      
        def init_db():
            """Run ``ckan db init`` against the config file named by ``ckan_ini``.

            If the command fails and its output mentions ``OperationalError``,
            the database is treated as not ready: the function waits 5 seconds
            and exits with status 1. Any other failure is re-raised.

            Raises:
                subprocess.CalledProcessError: when ``ckan db init`` fails for a
                    reason other than an ``OperationalError``.
            """
            db_command = ["ckan", "-c", ckan_ini, "db", "init"]
            print("[prerun] Initializing or upgrading db - start")
            try:
                subprocess.check_output(db_command, stderr=subprocess.STDOUT)
                print("[prerun] Initializing or upgrading db - end")
            except subprocess.CalledProcessError as e:
                # check_output captures bytes; decode so the substring test
                # below compares text with text instead of raising TypeError.
                output = e.output or b""
                if isinstance(output, bytes):
                    output = output.decode("utf-8", "replace")
                print(output)
                if "OperationalError" in output:
                    print("[prerun] Database not ready, waiting a bit before exit...")
                    time.sleep(5)
                    sys.exit(1)
                raise
      
        def init_datastore_db():
            """Apply the datastore permission SQL to the datastore database.

            The SQL is produced by ``ckan datastore set-permissions`` and run
            over a connection opened with ``CKAN_DATASTORE_WRITE_URL``. When
            that variable is unset or empty the step is skipped. A
            ``psycopg2.Error`` while applying the SQL is reported but not
            re-raised.
            """
            conn_str = os.environ.get("CKAN_DATASTORE_WRITE_URL")
            if not conn_str:
                print("[prerun] Skipping datastore initialization")
                return

            datastore_perms_command = ["ckan", "-c", ckan_ini, "datastore", "set-permissions"]

            connection = psycopg2.connect(conn_str)
            cursor = connection.cursor()

            print("[prerun] Initializing datastore db - start")
            try:
                datastore_perms = subprocess.Popen(
                    datastore_perms_command, stdout=subprocess.PIPE
                )

                # communicate() drains stdout and waits for the child, so the
                # pipe is closed and no zombie process is left behind.
                perms_sql, _ = datastore_perms.communicate()
                # Remove internal pg command as psycopg2 does not like it
                perms_sql = re.sub(b'\\\\connect "(.*)"', b"", perms_sql)
                cursor.execute(perms_sql)
                for notice in connection.notices:
                    print(notice)

                connection.commit()

                print("[prerun] Initializing datastore db - end")
            except psycopg2.Error as e:
                print("[prerun] Could not initialize datastore")
                print(str(e))

            except subprocess.CalledProcessError as e:
                # NOTE(review): Popen/communicate do not raise this themselves;
                # the handler is kept for parity with init_db.
                output = e.output or b""
                if isinstance(output, bytes):
                    # Decode so the substring test compares text with text.
                    output = output.decode("utf-8", "replace")
                print(output)
                if "OperationalError" in output:
                    print("[prerun] Database not ready, waiting a bit before exit...")
                    time.sleep(5)
                    sys.exit(1)
                raise
            finally:
                cursor.close()
                connection.close()
      
        def create_sysadmin():
            """Create the sysadmin user described by the environment, if missing.

            Reads ``CKAN_SYSADMIN_NAME``, ``CKAN_SYSADMIN_PASSWORD`` and
            ``CKAN_SYSADMIN_EMAIL``; nothing happens unless all three are set.
            If ``ckan user show`` already reports the user, creation is
            skipped. Otherwise the user is added, promoted to sysadmin, and
            ``$CKAN_STORAGE_PATH/storage`` is chowned to ``ckan:ckan``.

            Raises:
                subprocess.CalledProcessError: if ``ckan user show`` fails.
                KeyError: if ``CKAN_STORAGE_PATH`` is unset when the user is
                    created.
            """
            name = os.environ.get("CKAN_SYSADMIN_NAME")
            password = os.environ.get("CKAN_SYSADMIN_PASSWORD")
            email = os.environ.get("CKAN_SYSADMIN_EMAIL")

            if name and password and email:

                # Check if user exists
                command = ["ckan", "-c", ckan_ini, "user", "show", name]

                out = subprocess.check_output(command)
                # Raw bytes pattern: a plain b"\s" is an invalid escape sequence.
                if b"User:None" not in re.sub(br"\s", b"", out):
                    print("[prerun] Sysadmin user exists, skipping creation")
                    return

                # Create user
                # NOTE(review): the password is passed on the command line and
                # may be visible in the process list while the command runs.
                command = [
                    "ckan",
                    "-c",
                    ckan_ini,
                    "user",
                    "add",
                    name,
                    "password=" + password,
                    "email=" + email,
                ]

                subprocess.call(command)
                print("[prerun] Created user {0}".format(name))

                # Make it sysadmin
                command = ["ckan", "-c", ckan_ini, "sysadmin", "add", name]

                subprocess.call(command)
                print("[prerun] Made user {0} a sysadmin".format(name))

                # cleanup permissions
                # We're running as root before pivoting to uwsgi and dropping privs
                data_dir = "%s/storage" % os.environ['CKAN_STORAGE_PATH']

                command = ["chown", "-R", "ckan:ckan", data_dir]
                subprocess.call(command)
                print("[prerun] Ensured storage directory is owned by ckan")
      
        if __name__ == "__main__":
            # Script entry point. Setting MAINTENANCE_MODE to "true" (in any
            # letter case) skips every setup step; otherwise the steps run in
            # the fixed order listed below.
            if os.environ.get("MAINTENANCE_MODE", "").lower() != "true":
                setup_steps = (
                    check_main_db_connection,
                    init_db,
                    update_plugins,
                    check_datastore_db_connection,
                    init_datastore_db,
                    check_solr_connection,
                    create_sysadmin,
                )
                for step in setup_steps:
                    step()
            else:
                print("[prerun] Maintenance mode, skipping setup...")
    • [ ] Update setup/start_ckan.sh

        #!/bin/sh
        #
        # Container start-up script: seeds placeholder API tokens and secrets in
        # the CKAN config, runs prerun.py, stores real API tokens, runs any
        # /docker-entrypoint.d scripts and finally starts supervisord and uwsgi.
        # CKAN is only started when prerun.py exited successfully.

        # Add ckan.datapusher.api_token to the CKAN config file (updated with corrected value later)
        ckan config-tool "$CKAN_INI" ckan.datapusher.api_token=xxx

        # Add ckan.xloader.api_token to the CKAN config file (updated with corrected value later)
        ckan config-tool "$CKAN_INI" ckan.xloader.api_token=xxx

        # Set up the Secret key used by Beaker and Flask
        # This can be overriden using a CKAN___BEAKER__SESSION__SECRET env var
        # Inspect the same file that config-tool edits, not a relative ckan.ini.
        if grep -E "beaker.session.secret ?= ?$" "$CKAN_INI"
        then
            echo "Setting beaker.session.secret in ini file"
            ckan config-tool "$CKAN_INI" "beaker.session.secret=$(python3 -c 'import secrets; print(secrets.token_urlsafe())')"
            ckan config-tool "$CKAN_INI" "WTF_CSRF_SECRET_KEY=$(python3 -c 'import secrets; print(secrets.token_urlsafe())')"
            JWT_SECRET=$(python3 -c 'import secrets; print("string:" + secrets.token_urlsafe())')
            ckan config-tool "$CKAN_INI" "api_token.jwt.encode.secret=${JWT_SECRET}"
            ckan config-tool "$CKAN_INI" "api_token.jwt.decode.secret=${JWT_SECRET}"
        fi

        # Run the prerun script to init CKAN and create the default admin user
        python3 prerun.py
        # Remember the result now: every later command overwrites $?.
        PRERUN_STATUS=$?

        echo "Set up ckan.datapusher.api_token in the CKAN config file"
        ckan config-tool "$CKAN_INI" "ckan.datapusher.api_token=$(ckan -c "$CKAN_INI" user token add ckan_admin datapusher | tail -n 1 | tr -d '\t')"

        echo "Set up ckan.xloader.api_token in the CKAN config file"
        ckan config-tool "$CKAN_INI" "ckan.xloader.api_token=$(ckan -c "$CKAN_INI" user token add ckan_admin xloader | tail -n 1 | tr -d '\t')"

        echo "Set up ckanext.xloader.jobs_db.uri in the CKAN config file"
        ckan config-tool "$CKAN_INI" "ckanext.xloader.jobs_db.uri=${CKAN_SQLALCHEMY_URL}"

        # Run any startup scripts provided by images extending this one
        # POSIX test syntax, since the shebang is /bin/sh rather than bash.
        if [ -d "/docker-entrypoint.d" ]
        then
            for f in /docker-entrypoint.d/*; do
                case "$f" in
                    *.sh)     echo "$0: Running init file $f"; . "$f" ;;
                    *.py)     echo "$0: Running init file $f"; python3 "$f"; echo ;;
                    *)        echo "$0: Ignoring $f (not an sh or py file)" ;;
                esac
                echo
            done
        fi

        # Set the common uwsgi options
        UWSGI_OPTS="--plugins http,python \
                    --socket /tmp/uwsgi.sock \
                    --wsgi-file /srv/app/wsgi.py \
                    --module wsgi:application \
                    --uid 92 --gid 92 \
                    --http 0.0.0.0:5000 \
                    --master --enable-threads \
                    --lazy-apps \
                    -p 2 -L -b 32768 --vacuum \
                    --harakiri $UWSGI_HARAKIRI"

        # Only start the services when prerun.py succeeded.
        if [ "$PRERUN_STATUS" -eq 0 ]
        then
            # Start supervisord
            supervisord --configuration /etc/supervisord.conf &
            # Start uwsgi
            # UWSGI_OPTS is left unquoted on purpose so it splits into separate arguments.
            uwsgi $UWSGI_OPTS
        else
          echo "[prerun] failed...not starting CKAN with XLoader."
        fi
  2. Standalone container (aircan) with ckanext-aircan. [Preferred]

mjanez commented 1 year ago

Note: information about running xloader on a separate server.

Adapt: ckan/ckan-base-xloader to:

mjanez/ckan-docker-spatial base Dockerfile CKAN 2.9.9

COMMAND SIZE COMMENT
ENV CKAN_INI=/srv/app/ckan.ini 0B buildkit.dockerfile.v0
ENV PIP_SRC=/srv/app/src 0B buildkit.dockerfile.v0
ENV CKAN_STORAGE_PATH=/var/lib/ckan 0B buildkit.dockerfile.v0
ENV CKAN_GIT_URL=https://github.com/ckan/ckan.git 0B buildkit.dockerfile.v0
ENV XLOADER_GIT_URL=https://github.com/ckan/ckanext-xloader.git 0B buildkit.dockerfile.v0
ENV CKAN_GIT_BRANCH=ckan-2.10.1 0B buildkit.dockerfile.v0
ENV XLOADER_GIT_BRANCH=1.0.1 0B buildkit.dockerfile.v0
ENV XLOADER__PLUGINS=image_view text_view recline_view datastore xloader envvars 0B buildkit.dockerfile.v0
ENV UWSGI_HARAKIRI=50 0B buildkit.dockerfile.v0
WORKDIR /srv/app 0B buildkit.dockerfile.v0
RUN /bin/sh -c apk add --no-cache tzdata git gettext postgresql-client python3 libxml2 libxslt musl-dev uwsgi uwsgi-http uwsgi-corerouter uwsgi-python py3-gevent uwsgi-gevent libmagic curl patch bash && apk add --no-cache --virtual .build-deps postgresql-dev gcc make g++ autoconf automake libtool python3-dev libxml2-dev libxslt-dev linux-headers openssl-dev libffi-dev cargo && mkdir -p ${SRC_DIR} && curl -o ${SRC_DIR}/get-pip.py https://bootstrap.pypa.io/get-pip.py && python3 ${SRC_DIR}/get-pip.py && pip3 install supervisor && mkdir /etc/supervisord.d && rm -rf ${SRC_DIR}/get-pip.py # buildkit 1.65GB buildkit.dockerfile.v0
RUN /bin/sh -c pip3 install -e git+${CKAN_GIT_URL}@${CKAN_GIT_BRANCH}#egg=ckan && cd ${SRC_DIR}/ckan && cp who.ini ${APP_DIR} && pip3 install --no-binary markdown -r requirements.txt && pip3 install -e git+https://github.com/okfn/ckanext-envvars.git#egg=ckanext-envvars && ckan generate config ${CKAN_INI} && ckan config-tool ${CKAN_INI} "beaker.session.secret = " && ckan config-tool ${CKAN_INI} "ckan.plugins = ${XLOADER__PLUGINS}" # buildkit 234MB buildkit.dockerfile.v0
RUN /bin/sh -c addgroup -g 92 -S ckan && adduser -u 92 -h /home/ckan -s /bin/bash -D -G ckan ckan # buildkit 5.09kB buildkit.dockerfile.v0
RUN /bin/sh -c mkdir -p ${CKAN_STORAGE_PATH} && chown -R ckan:ckan ${CKAN_STORAGE_PATH} # buildkit 0B buildkit.dockerfile.v0
RUN /bin/sh -c mkdir -p ${APP_DIR}/src && git clone -b ${XLOADER_VERSION} --depth=1 --single-branch ${XLOADER_GIT_URL} && cd ckanext-xloader && python3 setup.py install && pip3 install -r requirements.txt && pip3 install -U requests[security] # buildkit 104MB buildkit.dockerfile.v0
COPY setup/supervisord.conf /etc # buildkit 480B buildkit.dockerfile.v0
COPY setup/prerun.py /srv/app # buildkit 7.07kB buildkit.dockerfile.v0
COPY setup/start_ckan.sh /srv/app # buildkit 2.67kB buildkit.dockerfile.v0
COPY setup/supervisor.worker.conf /etc/supervisord.d/supervisor.worker.conf # buildkit 289B buildkit.dockerfile.v0
ADD https://raw.githubusercontent.com/ckan/ckan/ckan-2.10.1/wsgi.py /srv/app # buildkit 582B buildkit.dockerfile.v0
RUN /bin/sh -c chmod 644 ${APP_DIR}/wsgi.py # buildkit 582B buildkit.dockerfile.v0
HEALTHCHECK &{["CMD-SHELL" "curl --fail http://localhost:8800/api/3/action/status_show exit CMD [\"/srv/app/start_ckan.sh\"]"] "1m0s" "5s" "0s" '\x05'}
CMD ["/srv/app/start_ckan.sh"]

Info: https://github.com/ckan/ckan-docker/wiki/Replacing-DataPusher-with-XLoader