I'm trying to Implement this topic and using data from "Sentiment analysis (binary classification)."
But keep getting errors.
So far, I got the error "Schema mismatch for feature column 'Features': expected Vector<R4>, got Text."
(Search "//Error occurred on this line.")
Still can't figure out what the problem is.
Thanks for helping me~
Following is the code:
Program.cs
// <SnippetAddUsings>
using System;
using System.IO;
using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;
// </SnippetAddUsings>
namespace MLmultiPg
{
class Program
{
//pg:1.IssueClassification; 2.SentimentAnalysis
static int pg = 2;
static string pgName = "Issue";
// <SnippetDeclareGlobalVariables>
private static string _appPath => Path.GetDirectoryName(Environment.GetCommandLineArgs()[0]);
private static string _txtDataPath => Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_txt.txt");
private static string _trainDataPath => Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_train.tsv");
private static string _testDataPath => Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_test.tsv");
private static string _modelPath => Path.Combine(_appPath, "..", "..", "..", "Models", pgName + "_model.zip");
private static MLContext _mlContext;
private static TrainCatalogBase.TrainTestData _trainTestData;
private static PredictionEngine<GitHubIssue, IssuePrediction> _predIssueEgn;
private static PredictionEngine<SentimentData, SentimentPrediction> _predStmEgn;
private static ITransformer _trainedModel;
static IDataView _trainingDataView;
// </SnippetDeclareGlobalVariables>
static void Main(string[] args)
{
// Create MLContext to be shared across the model creation workflow objects
// Set a random seed for repeatable/deterministic results across multiple trainings.
// <SnippetCreateMLContext>
_mlContext = new MLContext(seed: 0);
// </SnippetCreateMLContext>
setPg();
// <SnippetSplitData>
// var (trainData, testData) = _mlContext.MulticlassClassification.TrainTestSplit(_trainingDataView, testFraction: 0.1);
// </SnippetSplitData>
// <SnippetCallProcessData>
var pipeline = ProcessData();
// </SnippetCallProcessData>
// <SnippetCallBuildAndTrainModel>
var trainingPipeline = BuildAndTrainModel(_trainingDataView, pipeline);
// </SnippetCallBuildAndTrainModel>
// <SnippetCallEvaluate>
Evaluate();
Console.Read();
// </SnippetCallEvaluate>
// <SnippetCallPredictIssue>
//PredictIssue();
// </SnippetCallPredictIssue>
}
static private void setPg()
{
switch (pg)
{
case 1:
pgName = "issues";
//init();
setIssue();
break;
case 2:
pgName = "sentiment";
//init();
setStm();
break;
default:
break;
}
}
//static private void init()
//{
// _appPath = Path.GetDirectoryName(Environment.GetCommandLineArgs()[0]);
// _txtDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_txt.txt");
// _trainDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_train.tsv");
// _testDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_test.tsv");
// _modelPath = Path.Combine(_appPath, "..", "..", "..", "Models", pgName + "_model.zip");
//}
static private void setIssue()
{
// STEP 1: Common data loading configuration
// CreateTextReader<GitHubIssue>(hasHeader: true) - Creates a TextLoader by inferencing the dataset schema from the GitHubIssue data model type.
// .Read(_trainDataPath) - Loads the training text file into an IDataView (_trainingDataView) and maps from input columns to IDataView columns.
Console.WriteLine($"=============== Loading Dataset ===============");
// <SnippetLoadTrainData>
//_trainDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", "issues_train.tsv");
//_mlContext.Data.LoadFromTextFile<GitHubIssue>("C:\\angus\\temp\\IssuesClassification\\IssuesClassification\\Data\\issues_train.tsv", hasHeader: true);
_trainingDataView = _mlContext.Data.LoadFromTextFile<GitHubIssue>(_trainDataPath, hasHeader: true);
// </SnippetLoadTrainData>
Console.WriteLine($"=============== Finished Loading Dataset ===============");
}
static private void setStm()
{
IDataView dataView = _mlContext.Data.LoadFromTextFile<SentimentData>(_txtDataPath);
_trainTestData = _mlContext.BinaryClassification.TrainTestSplit(dataView, testFraction: 0.2);
_trainingDataView = _trainTestData.TrainSet;
}
public static IEstimator<ITransformer> ProcessData()
{
Console.WriteLine($"=============== Processing Data ===============");
// STEP 2: Common data process configuration with pipeline data transformations
// <SnippetMapValueToKey>
try
{
if (pg == 1) return issuePipe();
else if (pg == 2) return stmPipe();
else return null;
}
finally { Console.WriteLine($"=============== Finished Processing Data ==============="); }
// </SnippetAppendCache>
// <SnippetReturnPipeline>
//return pipeline;
// </SnippetReturnPipeline>
}
private static IEstimator<ITransformer> issuePipe()
{
var pipeline = _mlContext.Transforms.Conversion.MapValueToKey(inputColumnName: "Area", outputColumnName: "Label")
// </SnippetMapValueToKey>
// <SnippetFeaturizeText>
.Append(_mlContext.Transforms.Text.FeaturizeText(inputColumnName: "Title", outputColumnName: "TitleFeaturized"))
.Append(_mlContext.Transforms.Text.FeaturizeText(inputColumnName: "Description", outputColumnName: "DescriptionFeaturized"))
// </SnippetFeaturizeText>
// <SnippetConcatenate>
.Append(_mlContext.Transforms.Concatenate("Features", "TitleFeaturized", "DescriptionFeaturized"))
// </SnippetConcatenate>
//Sample Caching the DataView so estimators iterating over the data multiple times, instead of always reading from file, using the cache might get better performance.
// <SnippetAppendCache>
.AppendCacheCheckpoint(_mlContext);
return pipeline;
}
private static IEstimator<ITransformer> stmPipe()
{
var pipeline = _mlContext.Transforms.Conversion.MapValueToKey(inputColumnName: "Label", outputColumnName: "Label")
.Append(_mlContext.Transforms.Categorical.OneHotEncoding("SentimentText", "Features"))
//.Append(_mlContext.Transforms.Text.FeaturizeText(inputColumnName: "Features", outputColumnName: "SentimentTextFeaturized"))
.AppendCacheCheckpoint(_mlContext);
//var pipeline = _mlContext.Transforms.Text.FeaturizeText(outputColumnName: DefaultColumnNames.Features, inputColumnName: nameof(SentimentData.SentimentText));
return pipeline;
}
public static IEstimator<ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator<ITransformer> pipeline)
{
// STEP 3: Create the training algorithm/trainer
// Use the multi-class SDCA algorithm to predict the label using features.
//Set the trainer/algorithm and map label to value (original readable state)
// <SnippetAddTrainer>
var trainingPipeline = pipeline.Append(_mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features))
.Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));
// </SnippetAddTrainer>
// STEP 4: Train the model fitting to the DataSet
Console.WriteLine($"=============== Training the model ===============");
//Error occoured on this line.
// <SnippetTrainModel>
_trainedModel = trainingPipeline.Fit(trainingDataView);
// </SnippetTrainModel>
Console.WriteLine($"=============== Finished Training the model Ending time: {DateTime.Now.ToString()} ===============");
// (OPTIONAL) Try/test a single prediction with the "just-trained model" (Before saving the model)
Console.WriteLine($"=============== Single Prediction just-trained-model ===============");
try
{
if (pg == 1) issueEgn();
else if (pg == 2) stmEgn();
}
finally
{
}
// <SnippetReturnModel>
return trainingPipeline;
// </SnippetReturnModel>
}
private static void issueEgn()
{
// Create prediction engine related to the loaded trained model
// <SnippetCreatePredictionEngine1>
_predIssueEgn = _trainedModel.CreatePredictionEngine<GitHubIssue, IssuePrediction>(_mlContext);
// </SnippetCreatePredictionEngine1>
// <SnippetCreateTestIssue1>
GitHubIssue issue = new GitHubIssue()
{
Title = "WebSockets communication is slow in my machine",
Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
};
// </SnippetCreateTestIssue1>
// <SnippetPredict>
var prediction = _predIssueEgn.Predict(issue);
// </SnippetPredict>
// <SnippetOutputPrediction>
Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");
// </SnippetOutputPrediction>
}
private static void stmEgn()
{
// Create prediction engine related to the loaded trained model
// <SnippetCreatePredictionEngine1>
_predStmEgn = _trainedModel.CreatePredictionEngine<SentimentData, SentimentPrediction>(_mlContext);
// </SnippetCreatePredictionEngine1>
// <SnippetCreateTest>
SentimentData stm = new SentimentData()
{
SentimentText = "How could you do that to me?"
};
// </SnippetCreateTest>
// <SnippetPredict>
var prediction = _predStmEgn.Predict(stm);
// </SnippetPredict>
// <SnippetOutputPrediction>
Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Prediction} ===============");
// </SnippetOutputPrediction>
}
public static void Evaluate()
{
// STEP 5: Evaluate the model in order to get the model's accuracy metrics
Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Starting time: {DateTime.Now.ToString()} ===============");
try
{
if (pg == 1) issueEvl();
else if (pg == 2) stmEvl();
}
finally
{
}
SaveModelAsFile(_mlContext, _trainedModel);
// </SnippetCallSaveModel>
}
private static void issueEvl()
{
//Load the test dataset into the IDataView
// <SnippetLoadTestDataset>
var testDataView = _mlContext.Data.LoadFromTextFile<GitHubIssue>(_testDataPath, hasHeader: true);
// </SnippetLoadTestDataset>
//Evaluate the model on a test dataset and calculate metrics of the model on the test data.
// <SnippetEvaluate>
var testMetrics = _mlContext.MulticlassClassification.Evaluate(_trainedModel.Transform(testDataView));
// </SnippetEvaluate>
Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Ending time: {DateTime.Now.ToString()} ===============");
// <SnippetDisplayMetrics>
Console.WriteLine($"*************************************************************************************************************");
Console.WriteLine($"* Metrics for Multi-class Classification model - Test Data ");
Console.WriteLine($"*------------------------------------------------------------------------------------------------------------");
Console.WriteLine($"* MicroAccuracy: {testMetrics.AccuracyMicro:0.###}");
Console.WriteLine($"* MacroAccuracy: {testMetrics.AccuracyMacro:0.###}");
Console.WriteLine($"* LogLoss: {testMetrics.LogLoss:#.###}");
Console.WriteLine($"* LogLossReduction: {testMetrics.LogLossReduction:#.###}");
Console.WriteLine($"*************************************************************************************************************");
// </SnippetDisplayMetrics>
// Save the new model to .ZIP file
// <SnippetCallSaveModel>
}
private static void stmEvl()
{
//Load the test dataset into the IDataView
// <SnippetLoadTestDataset>
var testDataView = _trainTestData.TestSet;
// </SnippetLoadTestDataset>
//Evaluate the model on a test dataset and calculate metrics of the model on the test data.
// <SnippetEvaluate>
var testMetrics = _mlContext.MulticlassClassification.Evaluate(_trainedModel.Transform(testDataView));
// </SnippetEvaluate>
Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Ending time: {DateTime.Now.ToString()} ===============");
// <SnippetDisplayMetrics>
Console.WriteLine($"*************************************************************************************************************");
Console.WriteLine($"* Metrics for Multi-class Classification model - Test Data ");
Console.WriteLine($"*------------------------------------------------------------------------------------------------------------");
Console.WriteLine($"* MicroAccuracy: {testMetrics.AccuracyMicro:0.###}");
Console.WriteLine($"* MacroAccuracy: {testMetrics.AccuracyMacro:0.###}");
Console.WriteLine($"* LogLoss: {testMetrics.LogLoss:#.###}");
Console.WriteLine($"* LogLossReduction: {testMetrics.LogLossReduction:#.###}");
Console.WriteLine($"*************************************************************************************************************");
// </SnippetDisplayMetrics>
// Save the new model to .ZIP file
// <SnippetCallSaveModel>
}
public static void PredictIssue()
{
// <SnippetLoadModel>
ITransformer loadedModel;
using (var stream = new FileStream(_modelPath, FileMode.Open, FileAccess.Read, FileShare.Read))
{
loadedModel = _mlContext.Model.Load(stream);
}
// </SnippetLoadModel>
//1080320 Angus test
Console.WriteLine("Please enter the title:");
string t = Console.ReadLine();
Console.WriteLine("Please enter the description:");
string des = Console.ReadLine();
GitHubIssue singleIssue = new GitHubIssue() { Title = t, Description = des };
// <SnippetAddTestIssue>
//GitHubIssue singleIssue = new GitHubIssue() { Title = "Entity Framework crashes", Description = "When connecting to the database, EF is crashing" };
// </SnippetAddTestIssue>
//Predict label for single hard-coded issue
// <SnippetCreatePredictionEngine>
_predIssueEgn = loadedModel.CreatePredictionEngine<GitHubIssue, IssuePrediction>(_mlContext);
// </SnippetCreatePredictionEngine>
// <SnippetPredictIssue>
var prediction = _predIssueEgn.Predict(singleIssue);
// </SnippetPredictIssue>
// <SnippetDisplayResults>
//Console.WriteLine($"=============== Single Prediction - Result: {prediction.Area} ===============");
// </SnippetDisplayResults>
Console.WriteLine($"Title:" + t);
Console.WriteLine($"Description:" + des);
Console.WriteLine($"=============== Prediction Result: {prediction.Area} ===============");
Console.Read();
}
private static void SaveModelAsFile(MLContext mlContext, ITransformer model)
{
// <SnippetSaveModel>
using (var fs = new FileStream(_modelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
mlContext.Model.Save(model, fs);
// </SnippetSaveModel>
Console.WriteLine("The model is saved to {0}", _modelPath);
}
}
}
SentimentData.cs
// <SnippetAddUsings>
using Microsoft.ML.Data;
// </SnippetAddUsings>
namespace MLmultiPg
{
// <SnippetDeclareTypes>
public class SentimentData
{
[LoadColumn(0), ColumnName("Features")]
public string SentimentText;
[LoadColumn(1), ColumnName("Label")]
//[LoadColumn(1)]
public bool Sentiment;
}
public class SentimentPrediction
{
[ColumnName("PredictedLabel")]
public bool Prediction { get; set; }
// [ColumnName("Probability")]
public float Probability { get; set; }
// [ColumnName("Score")]
public float Score { get; set; }
}
// </SnippetDeclareTypes>
}
GitHubIssueData.cs
// <SnippetAddUsings>
using Microsoft.ML.Data;
// </SnippetAddUsings>
namespace MLmultiPg
{
// <SnippetDeclareTypes>
public class GitHubIssue
{
[LoadColumn(0)]
public string ID { get; set; }
[LoadColumn(1)]
public string Area { get; set; }
[LoadColumn(2)]
public string Title { get; set; }
[LoadColumn(3)]
public string Description { get; set; }
}
public class IssuePrediction
{
[ColumnName("PredictedLabel")]
public string Area;
}
// </SnippetDeclareTypes>
}
I'm trying to Implement this topic and using data from "Sentiment analysis (binary classification)." But keep getting errors. So far, I got the error "Schema mismatch for feature column 'Features': expected Vector<R4>, got Text." (Search "//Error occurred on this line.")
Still can't figure out what the problem is. Thanks for helping me~
Following is the code:
Program.cs
SentimentData.cs
GitHubIssueData.cs