checkpoint-restore / criu

Checkpoint/Restore tool
criu.org
Other
2.76k stars 559 forks source link

After dumping the unix socket, restore error occurred #2425

Closed Idealist226 closed 4 days ago

Idealist226 commented 6 days ago

Description

Hi everyone! I'm new to CRIU and I apologize if this is a basic question, but I'm having a hard time solving this issue. Any help would be greatly appreciated.

In order to test CRIU's support for Unix sockets, I wrote two programs, client.c and server.c. The client will send a request to the server. After receiving the request, the server will sleep for 5 seconds and then reply to the client, and then the process will be repeated continuously. Their contents are as follows:

// server.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>

#define SOCKET_PATH "/tmp/unix_dgram_socket_example"
/* printf() followed by an immediate flush so output shows up promptly even
 * when stdout is redirected to a log file (as in the reproduction steps).
 * Wrapped in do { } while (0) so the macro behaves as a single statement:
 * the original two-statement expansion breaks inside an unbraced if/else. */
#define my_printf(...) do { printf(__VA_ARGS__); fflush(stdout); } while (0)

/*
 * Serve datagrams on server_socket forever: receive a message, simulate
 * ~5 seconds of work, then echo the payload back to the sender.
 *
 * Bug fix: client_len is a value-result argument to recvfrom(); the kernel
 * overwrites it with the actual sender-address length on every call.  It
 * must therefore be re-initialized to sizeof(client_addr) before EACH
 * recvfrom(), not just once before the loop — otherwise later sender
 * addresses can be silently truncated to the shrunk length.
 */
void handle_client(int server_socket) {
    char buffer[256];
    int n;
    struct sockaddr_un client_addr;
    socklen_t client_len;

    while (1) {
        memset(&client_addr, 0, sizeof(client_addr));
        client_len = sizeof(client_addr); /* reset: recvfrom() shrinks it */
        n = recvfrom(server_socket, buffer, sizeof(buffer) - 1, 0,
                     (struct sockaddr *)&client_addr, &client_len);
        if (n < 0) {
            perror("recvfrom");
            continue;
        }

        buffer[n] = '\0'; /* safe: n <= sizeof(buffer) - 1 */
        my_printf("Received: %s\n", buffer);

        /* Simulate slow processing: five one-second ticks. */
        int i = 0;
        while (i++ < 5) {
            my_printf("i = %d\n", i);
            sleep(1);
        }

        /* Echo the payload back to the address recorded by recvfrom(). */
        if (sendto(server_socket, buffer, n, 0,
                   (struct sockaddr *)&client_addr, client_len) < 0) {
            perror("sendto");
        }
    }
}

/*
 * Entry point: create a UNIX datagram socket, bind it at SOCKET_PATH, and
 * hand it to handle_client(), which services requests in an endless loop.
 */
int main() {
    struct sockaddr_un addr;
    int sk = socket(AF_UNIX, SOCK_DGRAM, 0);

    if (sk < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    /* Remove a stale socket file left over from a previous run, if any. */
    unlink(SOCKET_PATH);

    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    strncpy(addr.sun_path, SOCKET_PATH, sizeof(addr.sun_path) - 1);

    if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        close(sk);
        exit(EXIT_FAILURE);
    }

    my_printf("Server is listening on %s...\n", SOCKET_PATH);

    handle_client(sk);

    /* Not reached while handle_client() loops forever; kept for symmetry. */
    close(sk);
    unlink(SOCKET_PATH);
    return 0;
}
// client.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>

#define SOCKET_PATH "/tmp/unix_dgram_socket_example"
/* printf() plus an immediate flush so output appears promptly even when
 * stdout is redirected to a log file.  Wrapped in do { } while (0) so the
 * macro acts as a single statement: the original two-statement expansion
 * breaks inside an unbraced if/else. */
#define my_printf(...) do { printf(__VA_ARGS__); fflush(stdout); } while (0)

/*
 * Client: bind its own UNIX datagram socket (required so the server has an
 * address to reply to), then loop forever sending a numbered greeting to
 * the server at SOCKET_PATH and printing the echoed reply.
 */
int main() {
    struct sockaddr_un srv_addr, my_addr;
    char msg[256];
    int count = 0;
    int sk;

    sk = socket(AF_UNIX, SOCK_DGRAM, 0);
    if (sk < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    /* A datagram client must bind an address of its own, or the server's
     * sendto() has nowhere to deliver the echo. */
    memset(&my_addr, 0, sizeof(my_addr));
    my_addr.sun_family = AF_UNIX;
    snprintf(my_addr.sun_path, sizeof(my_addr.sun_path), "/tmp/unix_dgram_client_socket_example");
    unlink(my_addr.sun_path);

    if (bind(sk, (struct sockaddr *)&my_addr, sizeof(my_addr)) < 0) {
        perror("bind");
        close(sk);
        exit(EXIT_FAILURE);
    }

    memset(&srv_addr, 0, sizeof(srv_addr));
    srv_addr.sun_family = AF_UNIX;
    strncpy(srv_addr.sun_path, SOCKET_PATH, sizeof(srv_addr.sun_path) - 1);

    for (;;) {
        snprintf(msg, sizeof(msg), "Hello, server %d", count);
        my_printf("going to send: %s\n", msg);
        count++;

        if (sendto(sk, msg, strlen(msg), 0,
                   (struct sockaddr *)&srv_addr, sizeof(srv_addr)) < 0) {
            perror("sendto");
            close(sk);
            exit(EXIT_FAILURE);
        }

        /* Sender address is not needed here, hence the NULL/NULL pair. */
        int n = recvfrom(sk, msg, sizeof(msg) - 1, 0, NULL, NULL);
        if (n < 0) {
            perror("recvfrom");
            close(sk);
            exit(EXIT_FAILURE);
        }

        msg[n] = '\0';
        my_printf("Echo from server: %s\n", msg);
    }

    /* Unreachable while the loop above runs forever; kept for completeness. */
    close(sk);
    unlink(my_addr.sun_path);
    my_printf("Disconnected from server.\n");
    return 0;
}

I compiled them using the following commands:

gcc client.c -o client1
gcc server.c -o server1

Steps to reproduce the issue:

  1. Run the server and client

    setsid ./server1 < /dev/null &> ./log/server.log &
    setsid ./client1  < /dev/null &> ./log/client.log &
  2. Dump client1

    #!/bin/bash
    
    # get client1's PID
    CLIENT_PID=$(pgrep client1)
    if [ -z "$CLIENT_PID" ]; then
       echo "client1 process not found"
       exit 1
    fi
    
    CLIENT_SOCKET="/tmp/unix_dgram_client_socket_example"
    if [ -z "$CLIENT_SOCKET" ]; then
       echo "No UNIX domain sockets found for client1"
       exit 1
    fi
    
    EXTERNAL_ARGS=""
    INODE=$(ls -i $CLIENT_SOCKET | awk '{print $1}')
    EXTERNAL_ARGS="$EXTERNAL_ARGS --ext-unix-sk --external unix[$INODE]"
    echo $EXTERNAL_ARGS
    
    # Execute the CRIU dump operation, including the obtained inode number as an external resource
    criu dump -D images -t "$CLIENT_PID" -v4 -o dump.log $EXTERNAL_ARGS
    
    echo "CRIU dump completed."
  3. Restore client1

    # 3016924 is the INODE number of "/tmp/unix_dgram_client_socket_example" obtained in the previous step
    criu restore -d -D images -v4 -o restore.log --ext-unix-sk --inherit-fd fd[2]:socket:[3016924]

Describe the results you received: After dumping client1, server.log prints "sendto: Connection refused". After executing the restore command, the client did not continue to send requests to the server, and the server did not respond normally.

Describe the results you expected: It is expected that after executing the restore command, the client can continue to send requests to the server and the server can respond normally.

CRIU logs and information:

CRIU full dump/restore logs:

``` // restore.log (00.000124) Version: 3.13 (gitid v3.13-1-g1d88f8b85) (00.000194) Running on cu04-virt01 Linux 5.10.0+ #5 SMP Thu Jan 11 17:46:12 CST 2024 x86_64 (00.000253) Loaded kdat cache from /run/criu.kdat (00.000266) File socket:[3016925] will be restored from inherit fd 2 (00.000293) rlimit: RLIMIT_NOFILE unlimited for self (00.000548) cpu: x86_family 6 x86_vendor_id GenuineIntel x86_model_id Intel Core Processor (Broadwell, IBRS) (00.000587) cpu: fpu: xfeatures_mask 0x5 xsave_size 832 xsave_size_max 832 xsaves_size 0 (00.000602) cpu: fpu: x87 floating point registers xstate_offsets 0 / 0 xstate_sizes 160 / 160 (00.000613) cpu: fpu: AVX registers xstate_offsets 576 / 576 xstate_sizes 256 / 256 (00.000623) cpu: fpu:1 fxsr:1 xsave:1 xsaveopt:1 xsavec:0 xgetbv1:0 xsaves:0 (00.000707) kernel pid_max=32768 (00.000722) Reading image tree (00.000805) Add mnt ns 5 pid 27572 (00.000825) Add net ns 2 pid 27572 (00.000840) pstree pid_max=27572 (00.000855) Will restore in 0 namespaces (00.000895) NS mask to use 0 (00.001009) Collecting 37/54 (flags 2) (00.001080) Collected [router/test/criu/client1] ID 0x1 (00.001099) Collected [lib/x86_64-linux-gnu/libc-2.27.so] ID 0x2 (00.001109) Collected [lib/x86_64-linux-gnu/ld-2.27.so] ID 0x3 (00.001123) Collected [dev/null] ID 0x4 (00.001133) Collected [router/test/criu/log/client.log] ID 0x5 (00.001158) unix: `- Got id 0x6 ino 2195531 type SOCK_DGRAM state TCP_CLOSE peer 0 (name /tmp/unix_dgram_client_socket_example dir -) (00.001200) Collected [root/.vscode-server/data/logs/20240626T112403/ptyhost.log] ID 0x7 (00.001219) Collected [root/.vscode-server/data/logs/20240626T112403/remoteagent.log] ID 0x8 (00.001228) Collected [root/.vscode-server/data/logs/20240626T112403/network.log] ID 0x9 (00.001238) Collected [root/.vscode-server/bin/8b3775030ed1a69b13e4f4c628c612102e30a681/vscode-remote-lock.root.8b3775030ed1a69b13e4f4c628c612102e30a681] ID 0xa (00.001247) Collected [router/test/criu] ID 0xb (00.001257) Collected [.] 
ID 0xc (00.001274) `- ... done (00.001282) Collecting 43/59 (flags 0) (00.001299) No remap-fpath.img image (00.001308) `- ... done (00.001322) No cgroup.img image (00.001340) Running pre-restore scripts (00.001444) No mountpoints-5.img image (00.001471) mnt: Reading mountpoint images (id 5 pid 27572) (00.001499) No netns-2.img image (00.001680) Forking task with 27572 pid (flags 0x0) (00.002039) PID: real 27572 virt 27572 (00.002326) 27572: cg: Cgroups 1 inherited from parent (00.002406) 27572: Calling restore_sid() for init (00.002419) 27572: Restoring 27572 to 27572 sid (00.002553) 27572: Collecting 41/37 (flags 2) (00.002614) 27572: No tty-info.img image (00.002639) 27572: `- ... done (00.002648) 27572: Collecting 42/51 (flags 0) (00.002661) 27572: No tty-data.img image (00.002690) 27572: `- ... done (00.002700) 27572: Restoring namespaces 27572 flags 0x0 (00.002731) 27572: Preparing info about shared resources (00.002830) 27572: Collecting 45/38 (flags 0) (00.002847) 27572: No filelocks.img image (00.002854) 27572: `- ... done (00.002861) 27572: Collecting 39/27 (flags 0) (00.002872) 27572: No pipes-data.img image (00.002879) 27572: `- ... done (00.002885) 27572: Collecting 40/27 (flags 0) (00.002896) 27572: No fifo-data.img image (00.002902) 27572: `- ... done (00.002909) 27572: Collecting 38/60 (flags 0) (00.002919) 27572: No sk-queues.img image (00.002926) 27572: `- ... 
done (00.003014) 27572: Found 18 VMAs in image (00.003051) 27572: vma 0x562ad6200000 0x562ad6201000 (00.003060) 27572: vma 0x562ad6401000 0x562ad6402000 (00.003068) 27572: vma 0x562ad6402000 0x562ad6403000 (00.003075) 27572: vma 0x562ad783d000 0x562ad785e000 (00.003082) 27572: vma 0x7f776710e000 0x7f77672f5000 (00.003088) 27572: vma 0x7f77672f5000 0x7f77674f5000 (00.003095) 27572: vma 0x7f77674f5000 0x7f77674f9000 (00.003101) 27572: vma 0x7f77674f9000 0x7f77674fb000 (00.003108) 27572: vma 0x7f77674fb000 0x7f77674ff000 (00.003115) 27572: vma 0x7f77674ff000 0x7f7767528000 (00.003121) 27572: vma 0x7f776770f000 0x7f7767711000 (00.003151) 27572: vma 0x7f7767728000 0x7f7767729000 (00.003158) 27572: vma 0x7f7767729000 0x7f776772a000 (00.003165) 27572: vma 0x7f776772a000 0x7f776772b000 (00.003172) 27572: vma 0x7ffc71b36000 0x7ffc71b57000 (00.003178) 27572: vma 0x7ffc71b6b000 0x7ffc71b6f000 (00.003185) 27572: vma 0x7ffc71b6f000 0x7ffc71b71000 (00.003191) 27572: vma 0xffffffffff600000 0xffffffffff601000 (00.003217) 27572: Collect fdinfo pid=27572 fd=0 id=0x4 (00.003228) 27572: Collect fdinfo pid=27572 fd=1 id=0x5 (00.003236) 27572: Collect fdinfo pid=27572 fd=2 id=0x5 (00.003243) 27572: Collect fdinfo pid=27572 fd=3 id=0x6 (00.003251) 27572: Collect fdinfo pid=27572 fd=19 id=0x7 (00.003258) 27572: Collect fdinfo pid=27572 fd=22 id=0x8 (00.003265) 27572: Collect fdinfo pid=27572 fd=27 id=0x9 (00.003272) 27572: Collect fdinfo pid=27572 fd=99 id=0xa (00.003465) 27572: skqueue: Preparing SCMs (00.003590) 27572: unix: Unlinked socket 2195531 peer 0 (name /tmp/unix_dgram_client_socket_example dir -) (00.003611) 27572: unix: ghost: Resolving addresses (00.003622) 27572: unix: `- will add fake 100 fd (00.003630) 27572: File descs: (00.003636) 27572: `- type 1 ID 0x1 (00.003643) 27572: `- type 1 ID 0x2 (00.003649) 27572: `- type 1 ID 0x3 (00.003655) 27572: `- type 1 ID 0x4 (00.003662) 27572: `- FD 0 pid 27572 (00.003668) 27572: `- type 1 ID 0x5 (00.003675) 27572: `- FD 1 pid 27572 
(00.003681) 27572: `- FD 2 pid 27572 (00.003688) 27572: `- type 5 ID 0x6 (00.003694) 27572: `- FD 3 pid 27572 (00.003700) 27572: `- type 1 ID 0x7 (00.003707) 27572: `- FD 19 pid 27572 (00.003713) 27572: `- type 1 ID 0x8 (00.003719) 27572: `- FD 22 pid 27572 (00.003726) 27572: `- type 1 ID 0x9 (00.003732) 27572: `- FD 27 pid 27572 (00.003738) 27572: `- type 1 ID 0xa (00.003744) 27572: `- FD 99 pid 27572 (00.003751) 27572: `- type 1 ID 0xb (00.003757) 27572: `- type 1 ID 0xc (00.003763) 27572: `- type 5 ID 0xd (00.003769) 27572: `- FD 100 pid 27572 (00.004061) 27572: Opened local page read 1 (parent 0) (00.004089) 27572: Enqueue page-read (00.004098) 27572: Enqueue page-read (00.004104) 27572: Enqueue page-read (00.004111) 27572: Enqueue page-read (00.004117) 27572: Enqueue page-read (00.004123) 27572: Enqueue page-read (00.004129) 27572: Enqueue page-read (00.004136) 27572: Enqueue page-read (00.004142) 27572: Enqueue page-read (00.004148) 27572: Enqueue page-read (00.004154) 27572: Enqueue page-read (00.004160) 27572: Enqueue page-read (00.004166) 27572: Enqueue page-read (00.004172) 27572: Enqueue page-read (00.004185) 27572: nr_restored_pages: 24 (00.004192) 27572: nr_shared_pages: 0 (00.004198) 27572: nr_dropped_pages: 0 (00.004205) 27572: nr_lazy: 0 (00.004223) 27572: Shrunk premap area to 0x7fedcaa78000(0) (00.004232) 27572: Restore on-core sigactions for 27572 (00.004457) 27572: Restoring children in alien sessions: (00.004482) 27572: Restoring children in our session: (00.004502) 27572: Restoring 27572 to 27572 pgid (00.004526) 27572: Restoring resources (00.004553) 27572: Opening fdinfo-s (00.004592) 27572: Create fd for 0 (00.004614) 27572: Create fd for 1 (00.004621) 27572: Going to dup 1 into 2 (00.004630) 27572: Receive fd for 2 (00.004640) 27572: unix: Opening standalone (stage 0 id 0x6 ino 2195531 peer 0) (00.004676) 27572: Create fd for 100 (00.004691) 27572: unix: bind id 0x6 ino 2195531 addr /tmp/unix_dgram_client_socket_example (00.004801) 27572: 
Create fd for 3 (00.004832) 27572: Create fd for 19 (00.004853) 27572: Create fd for 22 (00.004874) 27572: Create fd for 27 (00.004896) 27572: Create fd for 99 (00.004904) 27572: unix: Opening standalone (stage 1 id 0xd ino 0 peer 0) (00.004913) 27572: skqueue: Trying to restore recv queue for 6 (00.004941) 27572: sockets: 100 restore sndbuf 212992 rcv buf 212992 (00.004953) 27572: sockets: restore priority 0 for socket (00.004962) 27572: sockets: restore rcvlowat 1 for socket (00.004971) 27572: sockets: restore mark 0 for socket (00.004984) 27572: unix: Opening standalone (stage 1 id 0x6 ino 2195531 peer 0) (00.005011) 27572: sockets: 3 restore sndbuf 212992 rcv buf 212992 (00.005023) 27572: sockets: restore priority 0 for socket (00.005031) 27572: sockets: restore rcvlowat 1 for socket (00.005039) 27572: sockets: restore mark 0 for socket (00.005063) 27572: Opening 0x00562ad6200000-0x00562ad6201000 0000000000000000 (41) vma (00.005083) 27572: Opening 0x00562ad6401000-0x00562ad6402000 0x00000000001000 (41) vma (00.005091) 27572: Opening 0x00562ad6402000-0x00562ad6403000 0x00000000002000 (41) vma (00.005098) 27572: Opening 0x007f776710e000-0x007f77672f5000 0000000000000000 (20000041) vma (00.005115) 27572: Opening 0x007f77672f5000-0x007f77674f5000 0x000000001e7000 (41) vma (00.005123) 27572: Opening 0x007f77674f5000-0x007f77674f9000 0x000000001e7000 (41) vma (00.005130) 27572: Opening 0x007f77674f9000-0x007f77674fb000 0x000000001eb000 (41) vma (00.005137) 27572: Opening 0x007f77674ff000-0x007f7767528000 0000000000000000 (20000041) vma (00.005151) 27572: Opening 0x007f7767728000-0x007f7767729000 0x00000000029000 (41) vma (00.005158) 27572: Opening 0x007f7767729000-0x007f776772a000 0x0000000002a000 (41) vma (00.005288) 27572: `- render 9 iovs (0x562ad6200000:4096...) 
(00.005304) 27572: Restore via sigreturn (00.005403) 27572: Parsed 562d69600000-562d69730000 vma (00.005414) 27572: Parsed 562d6992f000-562d69950000 vma (00.005422) 27572: Parsed 562d6992f000-562d69956000 vma (00.005429) 27572: Parsed 562d6992f000-562d69967000 vma (00.005436) 27572: Parsed 562d6b303000-562d6b324000 vma (00.005443) 27572: Parsed 7fedcaee2000-7fedcb07f000 vma (00.005450) 27572: Parsed 7fedcaee2000-7fedcb27e000 vma (00.005458) 27572: Parsed 7fedcaee2000-7fedcb27f000 vma (00.005465) 27572: Parsed 7fedcaee2000-7fedcb280000 vma (00.005472) 27572: Parsed 7fedcaee2000-7fedcb29a000 vma (00.005479) 27572: Parsed 7fedcaee2000-7fedcb499000 vma (00.005485) 27572: Parsed 7fedcaee2000-7fedcb49a000 vma (00.005492) 27572: Parsed 7fedcaee2000-7fedcb49b000 vma (00.005499) 27572: Parsed 7fedcaee2000-7fedcb49f000 vma (00.005505) 27572: Parsed 7fedcaee2000-7fedcb4a6000 vma (00.005512) 27572: Parsed 7fedcaee2000-7fedcb6a5000 vma (00.005519) 27572: Parsed 7fedcaee2000-7fedcb6a6000 vma (00.005526) 27572: Parsed 7fedcaee2000-7fedcb6a7000 vma (00.005532) 27572: Parsed 7fedcaee2000-7fedcb88e000 vma (00.005539) 27572: Parsed 7fedcaee2000-7fedcba8e000 vma (00.005546) 27572: Parsed 7fedcaee2000-7fedcba92000 vma (00.005553) 27572: Parsed 7fedcaee2000-7fedcba94000 vma (00.005560) 27572: Parsed 7fedcaee2000-7fedcba98000 vma (00.005566) 27572: Parsed 7fedcaee2000-7fedcbaaf000 vma (00.005573) 27572: Parsed 7fedcaee2000-7fedcbcae000 vma (00.005580) 27572: Parsed 7fedcaee2000-7fedcbcaf000 vma (00.005587) 27572: Parsed 7fedcaee2000-7fedcbcb0000 vma (00.005593) 27572: Parsed 7fedcaee2000-7fedcbcb2000 vma (00.005600) 27572: Parsed 7fedcaee2000-7fedcbcd0000 vma (00.005607) 27572: Parsed 7fedcaee2000-7fedcbecf000 vma (00.005614) 27572: Parsed 7fedcaee2000-7fedcbed1000 vma (00.005621) 27572: Parsed 7fedcaee2000-7fedcbed2000 vma (00.005627) 27572: Parsed 7fedcaee2000-7fedcbed5000 vma (00.005634) 27572: Parsed 7fedcaee2000-7fedcc0d4000 vma (00.005641) 27572: Parsed 7fedcaee2000-7fedcc0d5000 
vma (00.005647) 27572: Parsed 7fedcaee2000-7fedcc0d6000 vma (00.005654) 27572: Parsed 7fedcaee2000-7fedcc0de000 vma (00.005661) 27572: Parsed 7fedcaee2000-7fedcc2dd000 vma (00.005668) 27572: Parsed 7fedcaee2000-7fedcc2de000 vma (00.005695) 27572: Parsed 7fedcaee2000-7fedcc2df000 vma (00.005702) 27572: Parsed 7fedcaee2000-7fedcc2f2000 vma (00.005709) 27572: Parsed 7fedcaee2000-7fedcc4f1000 vma (00.005716) 27572: Parsed 7fedcaee2000-7fedcc4f2000 vma (00.005722) 27572: Parsed 7fedcaee2000-7fedcc4f3000 vma (00.005729) 27572: Parsed 7fedcaee2000-7fedcc4f4000 vma (00.005736) 27572: Parsed 7fedcaee2000-7fedcc51d000 vma (00.005743) 27572: Parsed 7fedcc6f6000-7fedcc700000 vma (00.005750) 27572: Parsed 7fedcc6f6000-7fedcc706000 vma (00.005757) 27572: Parsed 7fedcc6f6000-7fedcc709000 vma (00.005763) 27572: Parsed 7fedcc6f6000-7fedcc70b000 vma (00.005782) 27572: Parsed 7fedcc6f6000-7fedcc70d000 vma (00.005789) 27572: Parsed 7fedcc6f6000-7fedcc71d000 vma (00.005795) 27572: Parsed 7fedcc6f6000-7fedcc71e000 vma (00.005802) 27572: Parsed 7fedcc6f6000-7fedcc71f000 vma (00.005808) 27572: Parsed 7fedcc6f6000-7fedcc720000 vma (00.005815) 27572: Parsed 7ffc6a697000-7ffc6a6b8000 vma (00.005823) 27572: Parsed 7ffc6a6fc000-7ffc6a700000 vma (00.005829) 27572: Parsed 7ffc6a6fc000-7ffc6a702000 vma (00.005836) 27572: Parsed ffffffffff600000-ffffffffff601000 vma (00.005851) 27572: 1 threads require 100K of memory (00.005859) 27572: Found bootstrap VMA hint at: 0x10000 (needs ~124K) (00.005983) 27572: call mremap(0x7fedcc706000, 8192, 8192, MAYMOVE | FIXED, 0x25000) (00.006018) 27572: call mremap(0x7fedcc709000, 8192, 8192, MAYMOVE | FIXED, 0x27000) (00.006054) 27572: x86: xsave runtime structure (00.006065) 27572: x86: ----------------------- (00.006071) 27572: x86: cwd:0x37f swd:0 twd:0 fop:0 mxcsr:0x1f80 mxcsr_mask:0xffff (00.006079) 27572: x86: magic1:0x46505853 extended_size:836 xstate_bv:0x7 xstate_size:832 (00.006087) 27572: x86: xstate_bv: 0x7 (00.006115) 27572: x86: 
----------------------- (00.006122) 27572: Thread 0 stack 0x1a080 rt_sigframe 0x22080 (00.006179) 27572: Going to chroot into /proc/self/fd/11 (00.006195) 27572: Restoring umask to 22 (00.006238) 27572: task_args: 0x24000 task_args->pid: 27572 task_args->nr_threads: 1 task_args->clone_restore_fn: 0x11c50 task_args->thread_args: 0x24540 (00.006255) pie: 27572: Switched to the restorer 27572 (00.006512) pie: 27572: Mapping native vDSO at 0x29000 (00.006564) pie: 27572: vdso: Using gettimeofday() on vdso at 0x2d8d0 (00.006591) pie: 27572: mmap(0x562ad6200000 -> 0x562ad6201000, 0x7 0x12 4) (00.006608) pie: 27572: mmap(0x562ad6401000 -> 0x562ad6402000, 0x3 0x12 4) (00.006620) pie: 27572: mmap(0x562ad6402000 -> 0x562ad6403000, 0x3 0x12 4) (00.006632) pie: 27572: mmap(0x562ad783d000 -> 0x562ad785e000, 0x3 0x32 -1) (00.006643) pie: 27572: mmap(0x7f776710e000 -> 0x7f77672f5000, 0x5 0x12 5) (00.006658) pie: 27572: mmap(0x7f77672f5000 -> 0x7f77674f5000, 0x2 0x12 5) (00.006672) pie: 27572: mmap(0x7f77674f5000 -> 0x7f77674f9000, 0x3 0x12 5) (00.006682) pie: 27572: mmap(0x7f77674f9000 -> 0x7f77674fb000, 0x3 0x12 5) (00.006694) pie: 27572: mmap(0x7f77674fb000 -> 0x7f77674ff000, 0x3 0x32 -1) (00.006704) pie: 27572: mmap(0x7f77674ff000 -> 0x7f7767528000, 0x5 0x12 6) (00.006719) pie: 27572: mmap(0x7f776770f000 -> 0x7f7767711000, 0x3 0x32 -1) (00.006729) pie: 27572: mmap(0x7f7767728000 -> 0x7f7767729000, 0x3 0x12 6) (00.006743) pie: 27572: mmap(0x7f7767729000 -> 0x7f776772a000, 0x3 0x12 6) (00.006754) pie: 27572: mmap(0x7f776772a000 -> 0x7f776772b000, 0x3 0x32 -1) (00.006764) pie: 27572: mmap(0x7ffc71b36000 -> 0x7ffc71b57000, 0x3 0x132 -1) (00.006775) pie: 27572: mmap(0x7ffc71b6b000 -> 0x7ffc71b6f000, 0x3 0x32 -1) (00.006785) pie: 27572: mmap(0x7ffc71b6f000 -> 0x7ffc71b71000, 0x7 0x32 -1) (00.006795) pie: 27572: Preadv 0x562ad6200000:4096... 
(9 iovs) (00.006988) pie: 27572: `- returned 98304 (00.007005) pie: 27572: `- skip pagemap (00.007012) pie: 27572: `- skip pagemap (00.007020) pie: 27572: `- skip pagemap (00.007026) pie: 27572: `- skip pagemap (00.007033) pie: 27572: `- skip pagemap (00.007040) pie: 27572: `- skip pagemap (00.007047) pie: 27572: `- skip pagemap (00.007053) pie: 27572: `- skip pagemap (00.007060) pie: 27572: `- skip pagemap (00.007071) pie: 27572: vdso: Parsing at 0x7ffc71b6f000 0x7ffc71b71000 (00.007079) pie: 27572: vdso: PT_LOAD p_vaddr: 0x0 (00.007087) pie: 27572: vdso: DT_HASH: 0x120 (00.007094) pie: 27572: vdso: DT_STRTAB: 0x2d8 (00.007101) pie: 27572: vdso: DT_SYMTAB: 0x1b8 (00.007108) pie: 27572: vdso: DT_STRSZ: 0x72 (00.007115) pie: 27572: vdso: DT_SYMENT: 0x18 (00.007122) pie: 27572: vdso: nbucket 0x3 nchain 0xc bucket 0x7ffc71b6f128 chain 0x7ffc71b6f134 (00.007132) pie: 27572: vdso: image [vdso] 0x7ffc71b6f000-0x7ffc71b71000 [vvar] 0x7ffc71b6b000-0x7ffc71b6f000 (00.007140) pie: 27572: vdso: Runtime vdso/vvar matches dumpee, remap inplace (00.007178) pie: 27572: vdso: Remap rt-vdso 0x2d000 -> 0x7ffc71b6f000 (00.007203) pie: 27572: vdso: Remap rt-vvar 0x29000 -> 0x7ffc71b6b000 (00.007226) pie: 27572: vdso: Using gettimeofday() on vdso at 0x7ffc71b6f8d0 (00.007305) pie: 27572: Restoring scheduler params 0.0.0 (00.007352) pie: 27572: 27572: Restored (00.007492) Running post-restore scripts (00.007528) Unlock network (00.007658) pie: 27572: seccomp: mode 0 on tid 27572 (00.007775) Force no-breakpoints restore (00.007802) Restore finished successfully. Resuming tasks. 
(00.007815) 27572 was trapped (00.007828) 27572 (native) is going to execute the syscall 202, required is 15 (00.007862) 27572 was trapped (00.007870) `- Expecting exit (00.007901) 27572 was trapped (00.007913) 27572 (native) is going to execute the syscall 3, required is 15 (00.007947) 27572 was trapped (00.007954) `- Expecting exit (00.007986) 27572 was trapped (00.007998) 27572 (native) is going to execute the syscall 3, required is 15 (00.008030) 27572 was trapped (00.008038) `- Expecting exit (00.008070) 27572 was trapped (00.008080) 27572 (native) is going to execute the syscall 11, required is 15 (00.008133) 27572 was trapped (00.008143) `- Expecting exit (00.008175) 27572 was trapped (00.008185) 27572 (native) is going to execute the syscall 15, required is 15 (00.008220) 27572 was stopped (00.008273) 27572 was trapped (00.008287) 27572 (native) is going to execute the syscall 11, required is 11 (00.008358) 27572 was stopped (00.008377) Running pre-resume scripts (00.008391) Writing stats (00.008480) Running post-resume scripts ```

Output of `criu --version`:

``` Version: 3.13 GitID: v3.13-1-g1d88f8b85 ```

Output of `criu check --all`:

``` Looks good. ```

adrianreber commented 6 days ago

First of all CRIU version 3.13 is really old and you should use the latest version for your tests.

After dumping client1, server.log prints "sendto: Connection refused".

To me that sounds like it is expected. After dumping client1 the process is gone and if your other process tries to communicate with the dumped process it will fail. So that makes totally sense to me. Does it not to you?

It is expected that after executing the restore command, the client can continue to send requests to the server and the server can respond normally.

After getting a "Connection refused" the socket will be closed. So this seems to be correct behaviour to me.

I would say you either need to ensure that no communication happens when only one of the processes is checkpointed or make sure that both process are checkpointed at the same time. To avoid situation like you are describing with TCP sockets CRIU adds firewall rules to ensure no traffic is being sent. The other way to handle this is to make your applications resilient to dropped connection and auto-reconnect.

Idealist226 commented 6 days ago

First of all CRIU version 3.13 is really old and you should use the latest version for your tests.

After dumping client1, server.log prints "sendto: Connection refused".

To me that sounds like it is expected. After dumping client1 the process is gone and if your other process tries to communicate with the dumped process it will fail. So that makes totally sense to me. Does it not to you?

It is expected that after executing the restore command, the client can continue to send requests to the server and the server can respond normally.

After getting a "Connection refused" the socket will be closed. So this seems to be correct behaviour to me.

I would say you either need to ensure that no communication happens when only one of the processes is checkpointed or make sure that both process are checkpointed at the same time. To avoid situation like you are describing with TCP sockets CRIU adds firewall rules to ensure no traffic is being sent. The other way to handle this is to make your applications resilient to dropped connection and auto-reconnect.

Thanks for your reply! I still have some doubts.

After dumping client1 the process is gone and if your other process tries to communicate with the dumped process it will fail. So that makes totally sense to me. Does it not to you?

When a program uses a TCP socket for communication, we can use the --tcp-established option to dump TCP connections, and in this case, a "Connection refused" issue will not occur. However, why does this issue occur when dumping Unix Domain Sockets?

You mean, if I use Unix Domain Sockets, it's better to dump both ends of the communication simultaneously? This seems different from the semantics of dumping TCP connections, where I can dump only one end of the communication. Is the --external option during dump and the --inherit-fd option during restore intended to allow us to dump only one end of the communication?

Thank you again for your response!

adrianreber commented 6 days ago

When a program uses a TCP socket for communication, we can use the --tcp-established option to dump TCP connections, and in this case, a "Connection refused" issue will not occur.

This seems to be a misunderstanding on your side. If you have a TCP connection and one of the processes ends (because of criu dump) the connection will also be dropped. As mentioned above CRIU will install a firewall rule to drop all network packets to avoid this situation. Without the firewall rule you will see a RST packet once one side of the communication is gone and the other side continues the communication. --tcp-established cannot help you here. I am not aware of a possibility to install a "firewall" for unix sockets. It might be possible, I just have not heard of it.

You mean, if I use Unix Domain Sockets, it's better to dump both ends of the communication simultaneously?

Yes. Or your process can reconnect automatically if a connection is dropped.

This seems different from the semantics of dumping TCP connections, where I can dump only one end of the communication.

This is not true. It only works because of what I mentioned above.

Another way to solve this would be to quiesce both processes before checkpointing and restoring it. Have a signal handler on both sides which disables communication until another handler can enable it again. Checkpointing and restoring distributed applications like you want to do requires some kind of coordination. Either by quiescing communication or by synchronization.

Idealist226 commented 6 days ago

When a program uses a TCP socket for communication, we can use the --tcp-established option to dump TCP connections, and in this case, a "Connection refused" issue will not occur.

This seems to be a misunderstanding on your side. If you have a TCP connection and one of the processes ends (because of criu dump) the connection will also be dropped. As mentioned above CRIU will install a firewall rule to drop all network packets to avoid this situation. Without the firewall rule you will see a RST packet once one side of the communication is gone and the other side continues the communication. --tcp-established cannot help you here. I am not aware of a possibility to install a "firewall" for unix sockets. It might be possible, I just have not heard of it.

You mean, if I use Unix Domain Sockets, it's better to dump both ends of the communication simultaneously?

Yes. Or your process can reconnect automatically if a connection is dropped.

This seems different from the semantics of dumping TCP connections, where I can dump only one end of the communication.

This is not true. It only works because of what I mentioned above.

Another way to solve this would be to quiesce both processes before checkpointing and restoring it. Have a signal handler on both sides which disables communication until another handler can enable it again. Checkpointing and restoring distributed applications like you want to do requires some kind of coordination. Either by quiescing communication or by synchronization.

Thank you for your reply; it has cleared up many of my doubts!

I read the External_UNIX_socket blog post, which mentions the importance of the --external and --inherit-fd options for dumping and restoring Unix Domain Socket connections. However, based on your response, if we dump and restore both ends of the communication simultaneously, these options seem to be unnecessary. Is that correct?

avagin commented 5 days ago

I read the External_UNIX_socket blog post, which mentions the importance of the --external and --inherit-fd options for dumping and restoring Unix Domain Socket connections. However, based on your response, if we dump and restore both ends of the communication simultaneously, these options seem to be unnecessary. Is that correct?

I think you misunderstand the usage of external sockets. If we look at your use-case when you dump just a client process. On restore, you can start a new server, create a new client socket and restore the client process specifying the new client socket as the external one.

Idealist226 commented 4 days ago

I read the External_UNIX_socket blog post, which mentions the importance of the --external and --inherit-fd options for dumping and restoring Unix Domain Socket connections. However, based on your response, if we dump and restore both ends of the communication simultaneously, these options seem to be unnecessary. Is that correct?

I think you misunderstand the usage of external sockets. If we look at your use-case when you dump just a client process. On restore, you can start a new server, create a new client socket and restore the client process specifying the new client socket as the external one.

I misunderstood it before, thank you for your answer! Thank you for everyone's help! I will close this issue.