dotnet / docs

This repository contains .NET Documentation.
https://learn.microsoft.com/dotnet
Creative Commons Attribution 4.0 International
4.27k stars 5.9k forks source link

Can we implement this topic by importing data from "Sentiment analysis (binary classification)?" #11555

Closed Angusonly closed 5 years ago

Angusonly commented 5 years ago

I'm trying to Implement this topic and using data from "Sentiment analysis (binary classification)." But keep getting errors. So far, I got the error "Schema mismatch for feature column 'Features': expected Vector<R4>, got Text." (Search "//Error occurred on this line.")

Still can't figure out what the problem is. Thanks for helping me~

Following is the code:

Program.cs

// <SnippetAddUsings>
using System;
using System.IO;
using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;
// </SnippetAddUsings>

namespace MLmultiPg
{
    class Program
    {
        //pg:1.IssueClassification; 2.SentimentAnalysis
        static int pg = 2;
        static string pgName = "Issue";

        // <SnippetDeclareGlobalVariables>
        private static string _appPath => Path.GetDirectoryName(Environment.GetCommandLineArgs()[0]);
        private static string _txtDataPath => Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_txt.txt");
        private static string _trainDataPath => Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_train.tsv");
        private static string _testDataPath => Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_test.tsv");
        private static string _modelPath => Path.Combine(_appPath, "..", "..", "..", "Models", pgName + "_model.zip");

        private static MLContext _mlContext;
        private static TrainCatalogBase.TrainTestData _trainTestData;
        private static PredictionEngine<GitHubIssue, IssuePrediction> _predIssueEgn;
        private static PredictionEngine<SentimentData, SentimentPrediction> _predStmEgn;
        private static ITransformer _trainedModel;
        static IDataView _trainingDataView;
        // </SnippetDeclareGlobalVariables>
        static void Main(string[] args)
        {

            // Create MLContext to be shared across the model creation workflow objects 
            // Set a random seed for repeatable/deterministic results across multiple trainings.
            // <SnippetCreateMLContext>
            _mlContext = new MLContext(seed: 0);
            // </SnippetCreateMLContext>
            setPg();

            // <SnippetSplitData>
            //   var (trainData, testData) = _mlContext.MulticlassClassification.TrainTestSplit(_trainingDataView, testFraction: 0.1);
            // </SnippetSplitData>

            // <SnippetCallProcessData>
            var pipeline = ProcessData();
            // </SnippetCallProcessData>

            // <SnippetCallBuildAndTrainModel>
            var trainingPipeline = BuildAndTrainModel(_trainingDataView, pipeline);
            // </SnippetCallBuildAndTrainModel>

            // <SnippetCallEvaluate>
            Evaluate();
            Console.Read();
            // </SnippetCallEvaluate>

            // <SnippetCallPredictIssue>
            //PredictIssue();
            // </SnippetCallPredictIssue>
        }

        static private void setPg()
        {
            switch (pg)
            {
                case 1:
                    pgName = "issues";
                    //init();
                    setIssue();
                    break;
                case 2:
                    pgName = "sentiment";
                    //init();
                    setStm();
                    break;
                default:
                    break;
            }
        }
        //static private void init()
        //{
        //    _appPath = Path.GetDirectoryName(Environment.GetCommandLineArgs()[0]);
        //    _txtDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_txt.txt");
        //    _trainDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_train.tsv");
        //    _testDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", pgName + "_test.tsv");
        //    _modelPath = Path.Combine(_appPath, "..", "..", "..", "Models", pgName + "_model.zip");
        //}
        static private void setIssue()
        {
            // STEP 1: Common data loading configuration 
            // CreateTextReader<GitHubIssue>(hasHeader: true) - Creates a TextLoader by inferencing the dataset schema from the GitHubIssue data model type.
            // .Read(_trainDataPath) - Loads the training text file into an IDataView (_trainingDataView) and maps from input columns to IDataView columns.
            Console.WriteLine($"=============== Loading Dataset  ===============");

            // <SnippetLoadTrainData>
            //_trainDataPath = Path.Combine(_appPath, "..", "..", "..", "Data", "issues_train.tsv");
            //_mlContext.Data.LoadFromTextFile<GitHubIssue>("C:\\angus\\temp\\IssuesClassification\\IssuesClassification\\Data\\issues_train.tsv", hasHeader: true);
            _trainingDataView = _mlContext.Data.LoadFromTextFile<GitHubIssue>(_trainDataPath, hasHeader: true);
            // </SnippetLoadTrainData>

            Console.WriteLine($"=============== Finished Loading Dataset  ===============");
        }
        static private void setStm()
        {
            IDataView dataView = _mlContext.Data.LoadFromTextFile<SentimentData>(_txtDataPath);
            _trainTestData = _mlContext.BinaryClassification.TrainTestSplit(dataView, testFraction: 0.2);
            _trainingDataView = _trainTestData.TrainSet;

        }

        public static IEstimator<ITransformer> ProcessData()
        {
            Console.WriteLine($"=============== Processing Data ===============");
            // STEP 2: Common data process configuration with pipeline data transformations
            // <SnippetMapValueToKey>

            try
            {
                if (pg == 1) return issuePipe();
                else if (pg == 2) return stmPipe();
                else return null;
            }
            finally { Console.WriteLine($"=============== Finished Processing Data ==============="); }

            // </SnippetAppendCache>

            // <SnippetReturnPipeline>
            //return pipeline;
            // </SnippetReturnPipeline>
        }
        private static IEstimator<ITransformer> issuePipe()
        {
            var pipeline = _mlContext.Transforms.Conversion.MapValueToKey(inputColumnName: "Area", outputColumnName: "Label")
                            // </SnippetMapValueToKey>
                            // <SnippetFeaturizeText>
                            .Append(_mlContext.Transforms.Text.FeaturizeText(inputColumnName: "Title", outputColumnName: "TitleFeaturized"))
                            .Append(_mlContext.Transforms.Text.FeaturizeText(inputColumnName: "Description", outputColumnName: "DescriptionFeaturized"))
                            // </SnippetFeaturizeText>
                            // <SnippetConcatenate>
                            .Append(_mlContext.Transforms.Concatenate("Features", "TitleFeaturized", "DescriptionFeaturized"))
                            // </SnippetConcatenate>
                            //Sample Caching the DataView so estimators iterating over the data multiple times, instead of always reading from file, using the cache might get better performance.
                            // <SnippetAppendCache>
                            .AppendCacheCheckpoint(_mlContext);
            return pipeline;
        }
        private static IEstimator<ITransformer> stmPipe()
        {
            var pipeline = _mlContext.Transforms.Conversion.MapValueToKey(inputColumnName: "Label", outputColumnName: "Label")
                            .Append(_mlContext.Transforms.Categorical.OneHotEncoding("SentimentText", "Features"))
                            //.Append(_mlContext.Transforms.Text.FeaturizeText(inputColumnName: "Features", outputColumnName: "SentimentTextFeaturized"))
                            .AppendCacheCheckpoint(_mlContext);
            //var pipeline = _mlContext.Transforms.Text.FeaturizeText(outputColumnName: DefaultColumnNames.Features, inputColumnName: nameof(SentimentData.SentimentText));
            return pipeline;
        }

        public static IEstimator<ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator<ITransformer> pipeline)
        {
            // STEP 3: Create the training algorithm/trainer
            // Use the multi-class SDCA algorithm to predict the label using features.
            //Set the trainer/algorithm and map label to value (original readable state)
            // <SnippetAddTrainer> 
            var trainingPipeline = pipeline.Append(_mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features))
                    .Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));
            // </SnippetAddTrainer> 

            // STEP 4: Train the model fitting to the DataSet
            Console.WriteLine($"=============== Training the model  ===============");

            //Error occoured on this line.
            // <SnippetTrainModel> 
            _trainedModel = trainingPipeline.Fit(trainingDataView);
            // </SnippetTrainModel> 
            Console.WriteLine($"=============== Finished Training the model Ending time: {DateTime.Now.ToString()} ===============");

            // (OPTIONAL) Try/test a single prediction with the "just-trained model" (Before saving the model)
            Console.WriteLine($"=============== Single Prediction just-trained-model ===============");

            try
            {
                if (pg == 1) issueEgn();
                else if (pg == 2) stmEgn();
            }
            finally
            {
            }

            // <SnippetReturnModel>
            return trainingPipeline;
            // </SnippetReturnModel>
        }
        private static void issueEgn()
        {
            // Create prediction engine related to the loaded trained model
            // <SnippetCreatePredictionEngine1>
            _predIssueEgn = _trainedModel.CreatePredictionEngine<GitHubIssue, IssuePrediction>(_mlContext);
            // </SnippetCreatePredictionEngine1>
            // <SnippetCreateTestIssue1> 
            GitHubIssue issue = new GitHubIssue()
            {
                Title = "WebSockets communication is slow in my machine",
                Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
            };
            // </SnippetCreateTestIssue1>

            // <SnippetPredict>
            var prediction = _predIssueEgn.Predict(issue);
            // </SnippetPredict>

            // <SnippetOutputPrediction>
            Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");
            // </SnippetOutputPrediction>

        }
        private static void stmEgn()
        {
            // Create prediction engine related to the loaded trained model
            // <SnippetCreatePredictionEngine1>
            _predStmEgn = _trainedModel.CreatePredictionEngine<SentimentData, SentimentPrediction>(_mlContext);
            // </SnippetCreatePredictionEngine1>
            // <SnippetCreateTest> 

            SentimentData stm = new SentimentData()
            {
                SentimentText = "How could you do that to me?"
            };

            // </SnippetCreateTest>

            // <SnippetPredict>
            var prediction = _predStmEgn.Predict(stm);
            // </SnippetPredict>

            // <SnippetOutputPrediction>
            Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Prediction} ===============");
            // </SnippetOutputPrediction>
        }

        public static void Evaluate()
        {
            // STEP 5:  Evaluate the model in order to get the model's accuracy metrics
            Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Starting time: {DateTime.Now.ToString()} ===============");

            try
            {
                if (pg == 1) issueEvl();
                else if (pg == 2) stmEvl();
            }
            finally
            {
            }

            SaveModelAsFile(_mlContext, _trainedModel);
            // </SnippetCallSaveModel>

        }
        private static void issueEvl()
        {
            //Load the test dataset into the IDataView
            // <SnippetLoadTestDataset>
            var testDataView = _mlContext.Data.LoadFromTextFile<GitHubIssue>(_testDataPath, hasHeader: true);
            // </SnippetLoadTestDataset>

            //Evaluate the model on a test dataset and calculate metrics of the model on the test data.
            // <SnippetEvaluate>
            var testMetrics = _mlContext.MulticlassClassification.Evaluate(_trainedModel.Transform(testDataView));
            // </SnippetEvaluate>

            Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Ending time: {DateTime.Now.ToString()} ===============");
            // <SnippetDisplayMetrics>
            Console.WriteLine($"*************************************************************************************************************");
            Console.WriteLine($"*       Metrics for Multi-class Classification model - Test Data     ");
            Console.WriteLine($"*------------------------------------------------------------------------------------------------------------");
            Console.WriteLine($"*       MicroAccuracy:    {testMetrics.AccuracyMicro:0.###}");
            Console.WriteLine($"*       MacroAccuracy:    {testMetrics.AccuracyMacro:0.###}");
            Console.WriteLine($"*       LogLoss:          {testMetrics.LogLoss:#.###}");
            Console.WriteLine($"*       LogLossReduction: {testMetrics.LogLossReduction:#.###}");
            Console.WriteLine($"*************************************************************************************************************");
            // </SnippetDisplayMetrics>

            // Save the new model to .ZIP file
            // <SnippetCallSaveModel>
        }
        private static void stmEvl()
        {
            //Load the test dataset into the IDataView
            // <SnippetLoadTestDataset>
            var testDataView = _trainTestData.TestSet;
            // </SnippetLoadTestDataset>

            //Evaluate the model on a test dataset and calculate metrics of the model on the test data.
            // <SnippetEvaluate>
            var testMetrics = _mlContext.MulticlassClassification.Evaluate(_trainedModel.Transform(testDataView));
            // </SnippetEvaluate>

            Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Ending time: {DateTime.Now.ToString()} ===============");
            // <SnippetDisplayMetrics>
            Console.WriteLine($"*************************************************************************************************************");
            Console.WriteLine($"*       Metrics for Multi-class Classification model - Test Data     ");
            Console.WriteLine($"*------------------------------------------------------------------------------------------------------------");
            Console.WriteLine($"*       MicroAccuracy:    {testMetrics.AccuracyMicro:0.###}");
            Console.WriteLine($"*       MacroAccuracy:    {testMetrics.AccuracyMacro:0.###}");
            Console.WriteLine($"*       LogLoss:          {testMetrics.LogLoss:#.###}");
            Console.WriteLine($"*       LogLossReduction: {testMetrics.LogLossReduction:#.###}");
            Console.WriteLine($"*************************************************************************************************************");
            // </SnippetDisplayMetrics>

            // Save the new model to .ZIP file
            // <SnippetCallSaveModel>
        }

        public static void PredictIssue()
        {
            // <SnippetLoadModel>
            ITransformer loadedModel;
            using (var stream = new FileStream(_modelPath, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                loadedModel = _mlContext.Model.Load(stream);
            }
            // </SnippetLoadModel>

            //1080320 Angus test
            Console.WriteLine("Please enter the title:");
            string t = Console.ReadLine();

            Console.WriteLine("Please enter the description:");
            string des = Console.ReadLine();

            GitHubIssue singleIssue = new GitHubIssue() { Title = t, Description = des };

            // <SnippetAddTestIssue> 
            //GitHubIssue singleIssue = new GitHubIssue() { Title = "Entity Framework crashes", Description = "When connecting to the database, EF is crashing" };
            // </SnippetAddTestIssue> 

            //Predict label for single hard-coded issue
            // <SnippetCreatePredictionEngine>
            _predIssueEgn = loadedModel.CreatePredictionEngine<GitHubIssue, IssuePrediction>(_mlContext);
            // </SnippetCreatePredictionEngine>

            // <SnippetPredictIssue>
            var prediction = _predIssueEgn.Predict(singleIssue);
            // </SnippetPredictIssue>

            // <SnippetDisplayResults>
            //Console.WriteLine($"=============== Single Prediction - Result: {prediction.Area} ===============");
            // </SnippetDisplayResults>

            Console.WriteLine($"Title:" + t);
            Console.WriteLine($"Description:" + des);
            Console.WriteLine($"=============== Prediction Result: {prediction.Area} ===============");

            Console.Read();

        }

        private static void SaveModelAsFile(MLContext mlContext, ITransformer model)
        {
            // <SnippetSaveModel> 
            using (var fs = new FileStream(_modelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
                mlContext.Model.Save(model, fs);
            // </SnippetSaveModel>

            Console.WriteLine("The model is saved to {0}", _modelPath);
        }

    }
}

SentimentData.cs

// <SnippetAddUsings>
using Microsoft.ML.Data;
// </SnippetAddUsings>

namespace MLmultiPg
{
    // <SnippetDeclareTypes>
    public class SentimentData
    {
        [LoadColumn(0), ColumnName("Features")]
        public string SentimentText;

        [LoadColumn(1), ColumnName("Label")]
        //[LoadColumn(1)]
        public bool Sentiment;

    }

    public class SentimentPrediction
    {
        [ColumnName("PredictedLabel")]
        public bool Prediction { get; set; }

       // [ColumnName("Probability")]
        public float Probability { get; set; }

      //  [ColumnName("Score")]
        public float Score { get; set; }
    }
    // </SnippetDeclareTypes>
}

GitHubIssueData.cs

// <SnippetAddUsings>
using Microsoft.ML.Data;
// </SnippetAddUsings>

namespace MLmultiPg
{
    // <SnippetDeclareTypes>
    public class GitHubIssue
    {
        [LoadColumn(0)]
        public string ID { get; set; }
        [LoadColumn(1)]
        public string Area { get; set; }
        [LoadColumn(2)]
        public string Title { get; set; }
        [LoadColumn(3)]
        public string Description { get; set; }
    }

    public class IssuePrediction
    {
        [ColumnName("PredictedLabel")]
        public string Area;
    }
    // </SnippetDeclareTypes>
}
natke commented 5 years ago

Hi @Angusonly, can you please ask this product support question in the gitter channel: https://gitter.im/dotnet/mlnet, or stackoverflow: https://stackoverflow.com/questions/tagged/ml.net