microsoft / FASTER

Fast persistent recoverable log and key-value store + cache, in C# and C++.
https://aka.ms/FASTER
MIT License
6.29k stars 565 forks source link

Random crashes when server concurrent GC enabled (didn't fixed) #835

Closed oleg2k closed 1 year ago

oleg2k commented 1 year ago

Original issue: https://github.com/microsoft/FASTER/issues/832

Crashes are still here. Captured some dumps: https://drive.google.com/drive/folders/1cBJY0s909O2Jvk306WgodLkoEi-V2Hn8

badrishc commented 1 year ago

Does it happen when you replace Memory<byte> with SpanByte. The memory support is hacky due to refCache concept, as mentioned earlier. We are looking into how to fix that. But suggest you use SpanByte instead. If you see an issue with this, that would be a serious problem.

oleg2k commented 1 year ago

Yes, it crashes with SpanByte. Uploaded two dumps: https://drive.google.com/drive/folders/10STHa5optzPspMe9HbOzWJNWgNhZlidC

using FASTER.core;
using System.Buffers.Binary;
using System.Buffers;

namespace ConsoleApp73;

using MySession = ClientSession<SpanByte, SpanByte, SpanByte, bool, Empty, ReducedUserDataFunctions>;

internal class Program
{
    static void Main()
    {
        var sourceDbPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "source");

        WriteSimpleSourceDb(sourceDbPath);

        for (var i = 0; i < 100; i++)
        {
            var dbPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, $"faster-{i}");
            try
            {
                Write(sourceDbPath, dbPath);
            }
            finally
            {
                try
                {
                    if (Directory.Exists(dbPath))
                        Directory.Delete(dbPath, true);
                }
                catch
                {
                    // ignore
                }
            }
        }
    }

    private static readonly Random random = new(1);

    private const int ValueSize = 1024;
    private const int SecondaryIndexCnt = 5;

    private static void WriteSimpleSourceDb(string sourceDbPath)
    {
        Console.WriteLine($"Source: {sourceDbPath}");

        if (Directory.Exists(sourceDbPath))
            Directory.Delete(sourceDbPath, true);

        using var fasterLogSettings = new FasterLogSettings(sourceDbPath);
        using var fasterLog = new FasterLog(fasterLogSettings);

        var valueBuffer = new byte[ValueSize];

        for (var index = 0; index < 500_000; index++)
        {
            random.NextBytes(valueBuffer);

            fasterLog.Enqueue(valueBuffer);
        }

        fasterLog.Commit(true);
    }

    private static void Write(string sourceDbPath, string dbPath)
    {
        Console.WriteLine(dbPath);

        if (Directory.Exists(dbPath))
            Directory.Delete(dbPath, true);

        var mainDbPath = Path.Combine(dbPath, "main");

        var tempIndexDbDirectory = Path.Combine(dbPath, "TempIndex");

        using var mainLogDevice = Devices.CreateLogDevice(mainDbPath);
        using var mainKvSettings = new FasterKVSettings<SpanByte, SpanByte> { LogDevice = mainLogDevice, PageSize = 1 * 1024 * 1024, SegmentSize = 32 * 1024 * 1024, CheckpointDir = mainDbPath };
        using var mainKv = new FasterKV<SpanByte, SpanByte>(mainKvSettings);

        var indexDevices = new List<IDevice>();
        var indexSettings = new List<FasterKVSettings<SpanByte, SpanByte>>();
        var indexKvs = new List<FasterKV<SpanByte, SpanByte>>();

        for (var i = 0; i < SecondaryIndexCnt; i++)
        {
            var indexPath = Path.Combine(tempIndexDbDirectory, $"i_{i}");

            var indexLogDevice = Devices.CreateLogDevice(indexPath);
            var indexKvSettings = new FasterKVSettings<SpanByte, SpanByte> { LogDevice = indexLogDevice, PageSize = 1 * 1024 * 1024, SegmentSize = 32 * 1024 * 1024, CheckpointDir = indexPath };
            var indexKv = new FasterKV<SpanByte, SpanByte>(indexKvSettings);

            indexDevices.Add(indexLogDevice);
            indexSettings.Add(indexKvSettings);
            indexKvs.Add(indexKv);
        }

        {
            using var mainKvSession = mainKv.For(ReducedUserDataFunctions.Instance).NewSession<ReducedUserDataFunctions>();

            var indexSessions = new List<MySession>();

            foreach (var indexKv in indexKvs)
            {
                indexSessions.Add(indexKv.For(ReducedUserDataFunctions.Instance).NewSession<ReducedUserDataFunctions>());
            }

            using var sourceLogSettings = new FasterLogSettings(sourceDbPath) { ReadOnlyMode = true };
            using var sourceLog = new FasterLog(sourceLogSettings);

            var recordCounter = 0;

            using var sourceIterator = sourceLog.Scan(sourceLog.BeginAddress, long.MaxValue);

            Span<byte> primaryKeyFixedBuffer = stackalloc byte[8];
            Span<byte> valueFixedBuffer = stackalloc byte[ValueSize];

            while (sourceIterator.GetNext(MemoryPool<byte>.Shared, out var sourceValueMemoryOwner, out var wholeLength, out var address))
            {
                recordCounter++;

                using (sourceValueMemoryOwner)
                {
                    if (wholeLength != ValueSize)
                        throw new InvalidOperationException("Wrong length");

                    sourceValueMemoryOwner.Memory.Span.Slice(0, wholeLength).CopyTo(valueFixedBuffer);
                }

                BinaryPrimitives.TryWriteInt64BigEndian(primaryKeyFixedBuffer, address);

                var secondaryKeySpans = new List<SpanByte>();

                for (var i = 0; i < SecondaryIndexCnt; i++)
                {
                    secondaryKeySpans.Add(SpanByte.FromFixedSpan(valueFixedBuffer.Slice(i * 4, 4)));
                }

                var primaryKeySpan = SpanByte.FromFixedSpan(primaryKeyFixedBuffer);

                if (TryReadReduced(mainKvSession, primaryKeySpan))
                {
                }

                Write(mainKvSession, primaryKeySpan, SpanByte.FromFixedSpan(valueFixedBuffer));

                for (var i = 0; i < SecondaryIndexCnt; i++)
                {
                    if (TryReadReduced(indexSessions[i], secondaryKeySpans[i]))
                    {
                    }

                    Write(indexSessions[i], secondaryKeySpans[i], primaryKeySpan);
                }

                if (recordCounter % 100_000 == 0)
                {
                    Console.WriteLine($"{recordCounter}...");
                }
            }

            Console.WriteLine($"{recordCounter} done");

            foreach (var indexSession in indexSessions) indexSession.Dispose();

            foreach (var indexKv in indexKvs)
            {
                indexKv.TakeHybridLogCheckpointAsync(CheckpointType.FoldOver, true).GetAwaiter().GetResult();
            }
        }

        mainKv.TakeHybridLogCheckpointAsync(CheckpointType.FoldOver, true).GetAwaiter().GetResult();

        foreach (var indexKv in indexKvs) indexKv.Dispose();
        foreach (var indexSetting in indexSettings) indexSetting.Dispose();
        foreach (var indexDevice in indexDevices) indexDevice.Dispose();
    }

    private static bool TryReadReduced(MySession session, SpanByte key)
    {
        var (status, _) = session.Read(key);

        if (status.Found)
        {
            return true;
        }

        //if (status.IsPending && session.CompletePendingWithOutputs(out var outputs, true))
        if (status.IsPending && session.CompletePending(true))
        {
            return true;
            //using (outputs)
            //{
            //    if (outputs.Next())
            //    {
            //        return true;
            //    }
            //}
        }

        return false;
    }

    private static void Write(MySession session, SpanByte key, SpanByte value)
    {
        var status = session.Upsert(key, value);

        if (status.IsPending)
        {
            session.CompletePending(true);
        }
    }
}

internal sealed class ReducedUserDataFunctions : SpanByteFunctions<SpanByte, bool, Empty>
{
    public static readonly ReducedUserDataFunctions Instance = new();

    private ReducedUserDataFunctions()
    {
    }

    public override bool SingleReader(ref SpanByte key, ref SpanByte input, ref SpanByte value, ref bool dst, ref ReadInfo readInfo)
    {
        dst = true;
        return true;
    }

    public override bool ConcurrentReader(ref SpanByte key, ref SpanByte input, ref SpanByte value, ref bool dst, ref ReadInfo readInfo)
    {
        dst = true;
        return true;
    }
}
badrishc commented 1 year ago

We are unable to reproduce it yet. Couple questions:

  1. Can you try to minimize the repro, e.g., does it happen if FasterLog is commented out?
  2. Can you see if settings can be tweaked to make it happen more quickly and reliably?
oleg2k commented 1 year ago

I will try. By the way, on Linux it happens more quickly

badrishc commented 1 year ago

Can you try with version 2.5.1 commit 6bc97ee and see if the issue is there as well. Checking if it could be related to the new work on push iterators.

oleg2k commented 1 year ago

Some info about testing on previous variation (with FasterLog):

                                net6.0      net7.0

Desktop i9-12900K, 64GB, Win-11                 ok      occasionally crashes

    Hyper-V Ubuntu 22.04.2                  not tested  occasionally crashes

ThinkPad i7-8565U, 16GB, Win-11                 n/t     rarely crashes (1 or 2 times)

i7-3770K, 32GB, win-10                      n/t     ok

i5-10500, 32GB, win-10                      n/t     ok

Mac M1 16GB (test built with VS 2022 for Mac Preview)       n/t     ok

If there were no crashes on the laptop, I would think about broken RAM on desktop (ran memtest86 anyway, it's ok).

Below is new test variation (without FasterLog) always crashes on i9-12900K, almost instantly after start. This new test variation contains some feed for GC, no crashes without it.

Tried different versions of FASTER (including 2.5.1 commit 6bc97ee) with this variation.

It seems test crashes on all versions starting from 1.9.8, and only on net7.0.

No crashes on version 1.9.7.

Test-1.9.x.cs

Test-2.x.cs

badrishc commented 1 year ago

Any progress with narrowing down the surface of things running, plus a more reliable repro. I havent been able to make it happen. You can try varying various tuning parameters to see if the repro get better.

badrishc commented 1 year ago

It seems this issue is unrelated to FASTER itself, but is a problem in .NET 7 with server GC. Here is a stand alone repro with no FASTER involved - we are in touch with the .NET team to determine the root cause and solution for it. Thanks!

using System.Text;

namespace ServerConcurrentGC;

internal class Program
{
    private const int FeedGCIter = 6_000 * 500;

    internal static void Main(string[] args)
    {
        for (var lap = 0; lap < 10; lap++)
        {
            Write(lap);
        }
    }

    private static void Write(int lap)
    {
        for (var i = 0; i < 5; i++)
        {
            new DummyFaster();
        }

        for (var i = 0; i < FeedGCIter; i++)
        {
            new StringBuilder(100);
        }
        Console.WriteLine($"{lap}: {FeedGCIter} done");
    }

    class DummyFaster
    {
        const int BufferSize = 16384;
        readonly byte[][] values;
        readonly FullPageStatus[] PageStatusIndicator;
        readonly PendingFlushList[] PendingFlush;

        public DummyFaster()
        {
            values = new byte[BufferSize][];
            PageStatusIndicator = new FullPageStatus[BufferSize];
            PendingFlush = new PendingFlushList[BufferSize];
            for (int i = 0; i < BufferSize; i++)
                PendingFlush[i] = new PendingFlushList();
        }
    }

    public struct FullPageStatus
    {
        public long LastFlushedUntilAddress;
        public long Dirty;
    }

    class PendingFlushList
    {
        public TestClass list;

        public PendingFlushList()
        {
            list = new();
        }
    }

    class TestClass
    {
    }
}
badrishc commented 1 year ago

Closing as it's verified to be an issue outside FASTER.