tursodatabase / libsql

libSQL is a fork of SQLite that is both Open Source, and Open Contributions.
https://turso.tech/libsql
MIT License

Sync hangs for a very long time when using archived database #1676

Closed · haaawk closed 2 months ago

haaawk commented 3 months ago

Given the following Cargo.toml:

[package]
name = "latencies-test"
version = "0.1.0"
edition = "2021"

[dependencies]
libsql = { git = "https://github.com/tursodatabase/libsql.git" }
tokio = { version = "1", features = ["full"] }

and main.rs:

use std::time::Instant;
use libsql::Builder;

#[tokio::main]
async fn main() {
    let url = "libsql://foo-ospfranco.turso.io";
    let token = "<token>";
    let db = Builder::new_remote_replica("test.db", url.into(), token.into())
        .build()
        .await
        .expect("failed to connect to db");

    let before = Instant::now();
    let conn = db.connect().expect("failed to connect to db");
    let after = Instant::now();
    println!("Time to connect: {:?}", after.duration_since(before));

    let before = Instant::now();
    let res = db.sync().await;
    let after = Instant::now();
    println!("Time to sync: {:?}", after.duration_since(before));
    res.expect("failed to sync db");

    let before = Instant::now();
    let _ = conn
        .query("select emp_no, first_name, last_name from employees limit 10", ())
        .await
        .expect("failed to query db");
    let after = Instant::now();
    println!("Time to query: {:?}", after.duration_since(before));
}

the program prints:

Time to connect: 1.128602ms
Time to sync: 1108.274159712s
thread 'main' panicked at src/main.rs:31:13:
failed to sync db: Replication(PrimaryHandshakeTimeout)
stack backtrace:
   0: rust_begin_unwind
             at /rustc/3f5fd8dd41153bc5fdca9427e9e05be2c767ba23/library/std/src/panicking.rs:652:5
   1: core::panicking::panic_fmt
             at /rustc/3f5fd8dd41153bc5fdca9427e9e05be2c767ba23/library/core/src/panicking.rs:72:14
   2: core::result::unwrap_failed
             at /rustc/3f5fd8dd41153bc5fdca9427e9e05be2c767ba23/library/core/src/result.rs:1679:5
   3: core::result::Result<T,E>::expect
             at /rustc/3f5fd8dd41153bc5fdca9427e9e05be2c767ba23/library/core/src/result.rs:1059:23
   4: latencies_test::main::{{closure}}
             at ./src/main.rs:31:9
   5: <core::pin::Pin<P> as core::future::future::Future>::poll
             at /rustc/3f5fd8dd41153bc5fdca9427e9e05be2c767ba23/library/core/src/future/future.rs:123:9
   6: tokio::runtime::park::CachedParkThread::block_on::{{closure}}
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/park.rs:281:63
   7: tokio::runtime::coop::with_budget
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/coop.rs:107:5
   8: tokio::runtime::coop::budget
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/coop.rs:73:5
   9: tokio::runtime::park::CachedParkThread::block_on
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/park.rs:281:31
  10: tokio::runtime::context::blocking::BlockingRegionGuard::block_on
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/context/blocking.rs:66:9
  11: tokio::runtime::scheduler::multi_thread::MultiThread::block_on::{{closure}}
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/scheduler/multi_thread/mod.rs:87:13
  12: tokio::runtime::context::runtime::enter_runtime
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/context/runtime.rs:65:16
  13: tokio::runtime::scheduler::multi_thread::MultiThread::block_on
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/scheduler/multi_thread/mod.rs:86:9
  14: tokio::runtime::runtime::Runtime::block_on_inner
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/runtime.rs:363:45
  15: tokio::runtime::runtime::Runtime::block_on
             at /Users/haaawk/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.39.2/src/runtime/runtime.rs:333:13
  16: latencies_test::main
             at ./src/main.rs:10:5
  17: core::ops::function::FnOnce::call_once
             at /rustc/3f5fd8dd41153bc5fdca9427e9e05be2c767ba23/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

I assume the timeouts on the Hello request are misconfigured.
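
A caller-side workaround (a sketch only; the 30 s deadline is an arbitrary value I picked, not something the library exposes) is to bound db.sync() with tokio::time::timeout so the program fails in seconds instead of waiting out the handshake retries:

use std::time::Duration;
use libsql::Builder;
use tokio::time::timeout;

#[tokio::main]
async fn main() {
    let db = Builder::new_remote_replica(
        "test.db",
        "libsql://foo-ospfranco.turso.io".to_string(),
        "<token>".to_string(),
    )
    .build()
    .await
    .expect("failed to connect to db");

    // Bound the sync on the caller side instead of waiting out the
    // library's internal handshake retries (30 s is an arbitrary choice).
    match timeout(Duration::from_secs(30), db.sync()).await {
        Ok(Ok(_)) => println!("sync finished"),
        Ok(Err(e)) => eprintln!("sync failed: {e}"),
        Err(_) => eprintln!("sync gave up after 30 s"),
    }
}

Dropping the sync future only stops waiting on it, though; it does not change the replicator's retry policy, which is the real problem here.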

haaawk commented 2 months ago

Some more logs:

2024-08-19T08:12:51.783229Z DEBUG libsql_replication::replicator: Attempting to perform handshake with primary.
2024-08-19T08:12:51.783929Z  INFO libsql::replication::remote_client: Attempting to perform handshake with primary.
2024-08-19T08:12:51.788755Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: tower_http::trace::on_request: started processing request
2024-08-19T08:12:51.788804Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: tonic_web::client: coercing HTTP2 request to HTTP1.1
2024-08-19T08:12:51.791811Z DEBUG hyper::client::connect::dns: resolving host="foo-ospfranco.turso.io"
2024-08-19T08:12:51.862667Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: hyper::client::connect::http: connecting to 66.51.123.211:443
2024-08-19T08:12:51.905776Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: hyper::client::connect::http: connected to 66.51.123.211:443
2024-08-19T08:12:51.906370Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: rustls::client::hs: No cached session for DnsName("foo-ospfranco.turso.io")    
2024-08-19T08:12:51.908643Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: rustls::client::hs: Not resuming any session    
2024-08-19T08:12:51.964482Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: rustls::client::hs: Using ciphersuite TLS13_AES_256_GCM_SHA384    
2024-08-19T08:12:51.965500Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: rustls::client::tls13: Not resuming    
2024-08-19T08:12:51.966351Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: rustls::client::tls13: TLS1.3 encrypted extensions: [ServerNameAck]    
2024-08-19T08:12:51.966402Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: rustls::client::hs: ALPN protocol is None    
2024-08-19T08:12:51.979113Z DEBUG hyper::proto::h1::io: flushed 497 bytes
2024-08-19T08:13:02.205164Z DEBUG hyper::proto::h1::io: parsed 7 headers
2024-08-19T08:13:02.205225Z DEBUG hyper::proto::h1::conn: incoming body is empty
2024-08-19T08:13:02.205940Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: hyper::client::pool: pooling idle connection for ("https", foo-ospfranco.turso.io)
2024-08-19T08:13:02.206084Z DEBUG request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: tower_http::trace::on_response: finished processing request latency=10418 ms status=200
2024-08-19T08:13:02.206137Z ERROR request{method=POST uri=https://foo-ospfranco.turso.io/wal_log.ReplicationLog/Hello version=HTTP/2.0}: tower_http::trace::on_failure: response failed classification=Code: 14 latency=10418 ms
2024-08-19T08:13:02.206317Z  WARN libsql::replication::remote_client: Failed handshake in 10422 ms: Client(Status { code: Unavailable, message: "bad gateway", metadata: MetadataMap { headers: {"fly-request-id": "01J5MV4D40M7PFB2GCPAZY2WPX-fra", "content-length": "0", "date": "Mon, 19 Aug 2024 08:13:02 GMT", "server": "Fly/f8a8cc0dd (2024-08-15)", "via": "1.1 fly.io"} }, source: None })
2024-08-19T08:13:02.206367Z ERROR libsql_replication::replicator: error connecting to primary. retrying. error: status: Unavailable, message: "bad gateway", details: [], metadata: MetadataMap { headers: {"fly-request-id": "01J5MV4D40M7PFB2GCPAZY2WPX-fra", "content-length": "0", "date": "Mon, 19 Aug 2024 08:13:02 GMT", "server": "Fly/f8a8cc0dd (2024-08-15)", "via": "1.1 fly.io"} }
2024-08-19T08:13:03.207651Z DEBUG libsql_replication::replicator: Attempting to perform handshake with primary.
2024-08-19T08:13:03.207716Z  INFO libsql::replication::remote_client: Attempting to perform handshake with primary.
haaawk commented 2 months ago

The request fails fast, but we retry 100 times with a 1 s sleep in between. @LucioFranco, why do we retry for so long? In this case the database is simply dead. Is there a way to return an error more quickly?
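
For reference, the ~1108 s sync time above is consistent with that policy: roughly 100 handshake attempts, each timing out after ~10 s, plus a 1 s sleep between them. A minimal sketch of that kind of fixed-count retry loop (the constant, sleep, error type, and function names below are placeholders for illustration, not the actual libsql-replication code):

use std::time::Duration;

const MAX_RETRIES: usize = 100;

#[derive(Debug)]
struct HandshakeError;

// Fixed-count retry: each failed attempt costs the handshake timeout
// (~10 s in the logs above) plus a 1 s sleep, so 100 attempts add up to ~1100 s.
async fn handshake_with_retries<F, Fut>(mut attempt: F) -> Result<(), HandshakeError>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<(), HandshakeError>>,
{
    let mut last_err = HandshakeError;
    for n in 0..MAX_RETRIES {
        match attempt().await {
            Ok(()) => return Ok(()),
            Err(e) => {
                last_err = e;
                if n + 1 < MAX_RETRIES {
                    tokio::time::sleep(Duration::from_secs(1)).await;
                }
            }
        }
    }
    Err(last_err)
}

Exposing the attempt count or an overall deadline through the Builder would let callers decide how long a dead database is worth waiting for.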