// Reproduction sample: load the images, preprocess them, split the result twice
// (train / hold-out, then hold-out into two further parts) and preview every
// partition to inspect how many rows it contains.
const string AssetsRelativePath = "assets";

var mlContext = new MLContext();

IEnumerable<ImageData> images = LoadImagesFromDirectory(
    folder: AssetsRelativePath,
    useFolderNameAsLabel: true);

var rawData = mlContext.Data.LoadFromEnumerable(images);
var shuffled = mlContext.Data.ShuffleRows(rawData);

// Step 1 of the pipeline: map the "Label" column to a key column.
var labelToKey = mlContext.Transforms.Conversion.MapValueToKey(
    outputColumnName: "LabelAsKey",
    inputColumnName: "Label");

// Step 2 of the pipeline: read the raw bytes of the file named by "ImagePath".
var loadImageBytes = mlContext.Transforms.LoadRawImageBytes(
    outputColumnName: "Image",
    imageFolder: AssetsRelativePath,
    inputColumnName: "ImagePath");

var pipeline = labelToKey.Append(loadImageBytes);

// Fit and transform on the same shuffled data.
var fittedPipeline = pipeline.Fit(shuffled);
var preProcessedData = fittedPipeline.Transform(shuffled);

// Observed: RowView holds 24 rows, which matches the 24 images on disk.
var preProcessedPreview = preProcessedData.Preview();

// First split: 70% train / 30% hold-out.
var trainSplit = mlContext.Data.TrainTestSplit(
    data: preProcessedData,
    testFraction: 0.3);

// Second split: the 30% hold-out is divided again into 90% / 10%.
var validationTestSplit = mlContext.Data.TrainTestSplit(
    trainSplit.TestSet,
    testFraction: 0.1);

// NOTE(review): the counts below suggest the split does not produce an exact
// row-count partition on a data set this small - confirm against the
// TrainTestSplit documentation.

// Observed: 19 rows. An exact 70% of 24 rows would be 16.8, i.e. 16 or 17.
var trainSetPreview = trainSplit.TrainSet.Preview();

// Observed: 5 rows. An exact 30% of 24 rows would be 7.2, i.e. 7 or 8.
var holdOutPreview = trainSplit.TestSet.Preview();

// Observed: 5 rows.
var validationTrainPreview = validationTestSplit.TrainSet.Preview();

// Observed: 0 rows - unclear to the reporter why this partition is empty.
var validationTestPreview = validationTestSplit.TestSet.Preview();
Following the preprocessing phase, the number of images is correct (24). However, following the various TrainTestSplit(), the data doesn't seem to be split correctly according to the fraction requested. Finally, I end up with an empty test dataset. You can find the details of each count at the end of the sample.
Is this the expected behaviour or did I misunderstand something?
System Information (please complete the following information)
Describe the bug
I'm following this tutorial: https://learn.microsoft.com/dotnet/machine-learning/tutorials/image-classification-api-transfer-learning. The assets directory I used contains 24 images.
Following the preprocessing phase, the number of images is correct (24). However, following the various `TrainTestSplit()` calls, the data doesn't seem to be split correctly according to the fraction requested. Finally, I end up with an empty test dataset. You can find the details of each count at the end of the sample.
Is this the expected behaviour or did I misunderstand something?
System Information (please complete the following information)