ckan / ideas

[DEPRECATED] Use the main CKAN repo Discussions instead:
https://github.com/ckan/ckan/discussions
40 stars 2 forks source link

Smart caching of datastore dumps #188

Open jqnatividad opened 7 years ago

jqnatividad commented 7 years ago

With the new datastore dump formats (https://github.com/ckan/ckan/pull/3390), and even with the regular CSV dump, the files are generated dynamically on each request. This is an expensive operation, and dumps of large tables time out with errors.

It'd be great if the writers could implement a smart caching mechanism in which the files are stored on the file system (or in the cloud, if using ckanext-cloudstorage) and the cached files are automatically expired when the datastore is updated.

jqnatividad commented 7 years ago

Was able to implement this through Nginx:

I modified the standard nginx.conf (http://docs.ckan.org/en/latest/maintaining/installing/deployment.html#create-the-nginx-config-file), changing it from:

# On-disk response cache: files live under /tmp/nginx_cache in a two-level
# directory hierarchy (levels=1:2); the shared-memory zone "cache" (30 MB)
# holds the keys, and the cache is capped at 250 MB on disk.
proxy_cache_path /tmp/nginx_cache levels=1:2 keys_zone=cache:30m max_size=250m;
# Scratch directory for buffering upstream responses before they are cached.
proxy_temp_path /tmp/nginx_proxy 1 2;

server {
    # Allow request bodies (e.g. uploads) of up to 100 MB.
    client_max_body_size 100M;
    location / {
        # Forward everything to the application server listening on port 8080.
        proxy_pass http://127.0.0.1:8080/;
        proxy_set_header X-Forwarded-For $remote_addr;
        proxy_set_header Host $host;
        # Serve from / store into the "cache" zone declared above.
        proxy_cache cache;
        # Requests carrying an auth_tkt cookie (presumably logged-in sessions)
        # are neither answered from the cache nor stored in it.
        proxy_cache_bypass $cookie_auth_tkt;
        proxy_no_cache $cookie_auth_tkt;
        # With no status codes listed, only 200, 301 and 302 responses are
        # cached, for 30 minutes.
        proxy_cache_valid 30m;
        proxy_cache_key $host$scheme$proxy_host$request_uri;
        # In an emergency, uncomment the next line to force caching
        # (nginx then ignores the upstream's cache-control headers).
        # proxy_ignore_headers X-Accel-Expires Expires Cache-Control;
    }

}

to

# General-purpose response cache: two-level directory hierarchy under
# /tmp/nginx_cache, 30 MB shared-memory key zone named "cache", 250 MB on disk.
proxy_cache_path /tmp/nginx_cache levels=1:2 keys_zone=cache:30m max_size=250m;
# Scratch directory for buffering upstream responses before they are cached.
proxy_temp_path /tmp/nginx_proxy 1 2;

# Datastore dump proxy cache: a separate zone ("dumpcache") so large dump
# files do not evict regular pages. Up to 5 GB on disk; entries that have not
# been requested for 24 hours are removed (inactive=24h).
proxy_cache_path /tmp/datastoredump_cache levels=1 keys_zone=dumpcache:30m max_size=5g inactive=24h;

server {
    # Allow request bodies (e.g. uploads) of up to 100 MB.
    client_max_body_size 100M;

    # General site traffic: 30-minute cache, skipped for logged-in sessions.
    location / {
        proxy_pass http://127.0.0.1:8080/;
        proxy_set_header X-Forwarded-For $remote_addr;
        proxy_set_header Host $host;
        proxy_cache cache;
        proxy_cache_bypass $cookie_auth_tkt;
        proxy_no_cache $cookie_auth_tkt;
        proxy_cache_valid 30m;
        proxy_cache_key $host$scheme$proxy_host$request_uri;
        # In an emergency, uncomment the next line to force caching
        # (nginx then ignores the upstream's cache-control headers).
        # proxy_ignore_headers X-Accel-Expires Expires Cache-Control;
    }

    # datastore dump: long-lived cache for expensive, dynamically generated
    # dump files.
    location /datastore/dump {
        # Tell clients to revalidate on every request; caching is done by
        # nginx (below), not by browsers or intermediate proxies.
        add_header Cache-Control "no-cache, must-revalidate, max-age=0";

        # proxy_set_header is NOT inherited from the sibling "location /"
        # block, so the forwarding headers must be repeated here.
        proxy_set_header X-Forwarded-For $remote_addr;
        proxy_set_header Host $host;

        # Compress dumps of any MIME type, including proxied responses.
        # NOTE(review): only takes effect if "gzip on" is set at a higher
        # level -- confirm in the main nginx.conf.
        gzip_comp_level 2;
        gzip_proxied any;
        gzip_types *;

        proxy_cache dumpcache;
        # Same key layout as "location /" so entries are kept per host.
        proxy_cache_key $host$scheme$proxy_host$request_uri;

        # SECURITY: never answer authenticated requests from the cache and
        # never store their responses. Without this, a dump of a private
        # resource fetched by an authorized user would be cached and then
        # served to anonymous clients for 24 hours. Authenticated requests
        # therefore always hit the backend.
        # NOTE(review): covers the login cookie and the API-key headers
        # (Authorization / X-CKAN-API-Key); extend if apikey_header_name is
        # customized.
        proxy_cache_bypass $cookie_auth_tkt $http_authorization $http_x_ckan_api_key;
        proxy_no_cache $cookie_auth_tkt $http_authorization $http_x_ckan_api_key;

        # While an expired entry is being refreshed, keep serving the stale
        # copy instead of queueing clients behind the regeneration.
        proxy_cache_use_stale updating;

        # Only one request per key populates the cache. The lock defaults
        # (5s age / 5s timeout) are far shorter than a large dump takes, so
        # raise them to match the 600s read timeout; otherwise concurrent
        # requests are released to the backend after 5 seconds.
        proxy_cache_lock on;
        proxy_cache_lock_age 600s;
        proxy_cache_lock_timeout 600s;

        # Cache successful dumps for 24 hours, regardless of the cache
        # headers sent by the upstream.
        proxy_cache_valid 200 24h;
        proxy_ignore_headers X-Accel-Expires Expires Cache-Control;

        # Give the backend up to 10 minutes to produce a dump.
        proxy_connect_timeout       600;
        proxy_send_timeout          600;
        proxy_read_timeout          600;
        send_timeout                600;
        client_max_body_size          0;

        proxy_pass http://127.0.0.1:8080/datastore/dump;
    }
}

This creates a 24-hour nginx proxy cache for datastore dumps with a maximum size of 5 GB, and allows each dump request up to 10 minutes to generate the file.