Esempio n. 1
0
        /// <summary>
        /// Trains a multiclass-log network on the "train" images of <paramref name="dataset"/> and
        /// periodically validates it against the "test" images.
        /// </summary>
        /// <remarks>
        /// Per-epoch progress is written to the console and to "{baseName}.log"; the trained network
        /// is serialized to "{baseName}.dat". Any exception is printed to the console and swallowed.
        /// </remarks>
        /// <param name="baseName">Prefix for the trainer synchronization file, the log file and the serialized network.</param>
        /// <param name="dataset">Dataset directory passed to <c>Load</c>; also the location of "train.mean.bmp" when <paramref name="useMean"/> is set.</param>
        /// <param name="epoch">Maximum number of epochs to run.</param>
        /// <param name="learningRate">Initial learning rate handed to the trainer.</param>
        /// <param name="minLearningRate">Training stops early once the trainer's learning rate falls below this value.</param>
        /// <param name="miniBatchSize">Number of samples per mini-batch; must be greater than zero.</param>
        /// <param name="validation">Validate every this many epochs (epoch 0 excluded); zero disables periodic validation.</param>
        /// <param name="useMean">When true, the path of "train.mean.bmp" inside the dataset directory is passed to <c>Load</c>.</param>
        private static void Train(string baseName, string dataset, uint epoch, double learningRate, double minLearningRate, uint miniBatchSize, uint validation, bool useMean)
        {
            try
            {
                // A zero batch size makes the batch-count computation meaningless; reject it
                // before any expensive loading happens. The catch block below reports it.
                if (miniBatchSize == 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(miniBatchSize), "Mini batch size must be greater than zero.");
                }

                IList<Matrix<RgbPixel>> trainingImages;
                IList<uint> trainingLabels;
                IList<Matrix<RgbPixel>> testingImages;
                IList<uint> testingLabels;

                var mean = useMean ? Path.Combine(dataset, "train.mean.bmp") : null;

                Console.WriteLine("Start load train images");
                Load("train", dataset, mean, out trainingImages, out trainingLabels);
                Console.WriteLine($"Load train images: {trainingImages.Count}");

                Console.WriteLine("Start load test images");
                Load("test", dataset, mean, out testingImages, out testingLabels);
                Console.WriteLine($"Load test images: {testingImages.Count}");

                // So with that out of the way, we can make a network instance.
                var trainNet  = NativeMethods.LossMulticlassLog_age_train_type_create();
                var networkId = LossMulticlassLogRegistry.GetId(trainNet);
                LossMulticlassLogRegistry.Add(trainNet);

                using (var net = new LossMulticlassLog(networkId))
                using (var trainer = new DnnTrainer<LossMulticlassLog>(net))
                {
                    trainer.SetLearningRate(learningRate);
                    trainer.SetMinLearningRate(minLearningRate);
                    trainer.SetMiniBatchSize(miniBatchSize);
                    trainer.BeVerbose();
                    trainer.SetSynchronizationFile(baseName, 180);

                    // Pre-allocate the per-batch arrays once; they are refilled every epoch.
                    var trainingImagesCount = trainingImages.Count;

                    // Number of mini-batches per epoch, rounded up. Integer arithmetic avoids the
                    // precision loss of a float division for large data sets.
                    var maxIteration = (int)((trainingImagesCount + (long)miniBatchSize - 1) / miniBatchSize);
                    var imageBatches = new Matrix<RgbPixel>[maxIteration][];
                    var labelBatches = new uint[maxIteration][];
                    for (var i = 0; i < maxIteration; i++)
                    {
                        // Every batch is full except possibly the last one. Images and labels of a
                        // batch are always sized identically because they are filled by the same loop.
                        var remaining = trainingImagesCount - (long)i * miniBatchSize;
                        var batchSize = Math.Min((long)miniBatchSize, remaining);
                        imageBatches[i] = new Matrix<RgbPixel>[batchSize];
                        labelBatches[i] = new uint[batchSize];
                    }

                    using (var fs = new FileStream($"{baseName}.log", FileMode.Create, FileAccess.Write, FileShare.Write))
                    using (var sw = new StreamWriter(fs, Encoding.UTF8))
                    {
                        for (var e = 0; e < epoch; e++)
                        {
                            // Visit the training samples in a fresh random order every epoch.
                            var randomArray = Enumerable.Range(0, trainingImagesCount).OrderBy(unused => Guid.NewGuid()).ToArray();
                            var index       = 0;
                            for (var i = 0; i < imageBatches.Length; i++)
                            {
                                var currentImages = imageBatches[i];
                                var currentLabels = labelBatches[i];
                                for (var j = 0; j < currentImages.Length; j++)
                                {
                                    var rIndex = randomArray[index];
                                    currentImages[j] = trainingImages[rIndex];
                                    currentLabels[j] = trainingLabels[rIndex];
                                    index++;
                                }
                            }

                            for (var i = 0; i < maxIteration; i++)
                            {
                                LossMulticlassLog.TrainOneStep(trainer, imageBatches[i], labelBatches[i]);
                            }

                            var lr   = trainer.GetLearningRate();
                            var loss = trainer.GetAverageLoss();

                            var trainLog = $"Epoch: {e}, learning Rate: {lr}, average loss: {loss}";
                            Console.WriteLine(trainLog);
                            sw.WriteLine(trainLog);

                            // validation == 0 means "never validate during training"; checking it first
                            // also keeps the modulo below from dividing by zero.
                            if (validation > 0 && e > 0 && e % validation == 0)
                            {
                                Validation(baseName, net, trainingImages, trainingLabels, testingImages, testingLabels, false, false, out var trainAccuracy, out var testAccuracy);

                                var validationLog = $"Epoch: {e}, train accuracy: {trainAccuracy}, test accuracy: {testAccuracy}";
                                Console.WriteLine(validationLog);
                                sw.WriteLine(validationLog);
                            }

                            if (lr < minLearningRate)
                            {
                                break;
                            }
                        }
                    }

                    // wait for training threads to stop
                    trainer.GetNet();
                    Console.WriteLine("done training");

                    net.Clean();
                    LossMulticlassLog.Serialize(net, $"{baseName}.dat");

                    // Run a final validation pass over both the training and the testing images
                    // with the fully trained network.
                    Validation(baseName, net, trainingImages, trainingLabels, testingImages, testingLabels, true, true, out _, out _);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Trains a per-pixel multiclass network on random crops of the PASCAL VOC2012 training
        /// images, saves it to "semantic_segmentation_voc2012net.dnn" and prints the accuracy on
        /// the training and validation listings.
        /// </summary>
        /// <param name="args">Exactly one element: the path to the VOC2012 dataset.</param>
        /// <returns>0 on success; 1 on wrong usage, when no images are found, or on any exception.</returns>
        private static int Main(string[] args)
        {
            try
            {
                if (args.Length != 1)
                {
                    Console.WriteLine("To run this program you need a copy of the PASCAL VOC2012 dataset.");
                    Console.WriteLine();
                    Console.WriteLine("You call this program like this: ");
                    Console.WriteLine("./dnn_semantic_segmentation_train_ex /path/to/VOC2012");
                    return 1;
                }

                Console.WriteLine("\nSCANNING PASCAL VOC2012 DATASET\n");

                var listing = PascalVOC2012.GetPascalVoc2012TrainListing(args[0]).ToArray();
                Console.WriteLine($"images in dataset: {listing.Length}");
                if (listing.Length == 0)
                {
                    Console.WriteLine("Didn't find the VOC2012 dataset.");
                    return 1;
                }

                const double initialLearningRate = 0.1;
                const double weightDecay         = 0.0001;
                const double momentum            = 0.9;

                using (var net = new LossMulticlassLogPerPixel(0))
                using (var sgd = new Sgd((float)weightDecay, (float)momentum))
                using (var trainer = new DnnTrainer<LossMulticlassLogPerPixel>(net, sgd))
                {
                    trainer.BeVerbose();
                    trainer.SetLearningRate(initialLearningRate);
                    trainer.SetSynchronizationFile("pascal_voc2012_trainer_state_file.dat", 10 * 60);
                    // This threshold is probably excessively large.
                    trainer.SetIterationsWithoutProgressThreshold(5000);
                    // Since the progress threshold is so large might as well set the batch normalization
                    // stats window to something big too.
                    Dlib.SetAllBnRunningStatsWindowSizes(net, 1000);

                    // Output training parameters.
                    Console.WriteLine();
                    Console.WriteLine(trainer);

                    var samples = new List<Matrix<RgbPixel>>();
                    var labels  = new List<Matrix<ushort>>();

                    // Start the data-loader thread(s) that read images from disk and pull out random
                    // crops (the range below currently starts a single thread).  It's important to
                    // feed the trainer fast enough to keep it busy.  Each thread puts its crops
                    // into the data queue.
                    using (var data = new Pipe<TrainingSample>(200))
                    {
                        var function = new Action<object>(seed =>
                        {
                            using (var rnd = new Rand((ulong)seed))
                            {
                                while (data.IsEnabled)
                                {
                                    // Pick a random input image.
                                    var imageInfo = listing[rnd.GetRandom32BitNumber() % listing.Length];

                                    // Load the input image and its ground-truth (RGB) label image.
                                    using (var inputImage = Dlib.LoadImageAsMatrix<RgbPixel>(imageInfo.ImageFilename))
                                    using (var rgbLabelImage = Dlib.LoadImageAsMatrix<RgbPixel>(imageInfo.ClassLabelFilename))
                                    using (var indexLabelImage = new Matrix<ushort>())
                                    {
                                        // Convert the RGB label image into class-index labels.
                                        PascalVOC2012.RgbLabelImageToIndexLabelImage(rgbLabelImage, indexLabelImage);

                                        // Randomly pick a part of the image.
                                        var temp = new TrainingSample();
                                        RandomlyCropImage(inputImage, indexLabelImage, temp, rnd);

                                        // Push the result to be used by the trainer.
                                        data.Enqueue(temp);
                                    }
                                }
                            }
                        });

                        var threads = Enumerable.Range(1, 1).Select(i =>
                        {
                            var dataLoader = new Thread(new ParameterizedThreadStart(function))
                            {
                                Name = $"dataLoader{i}"
                            };
                            dataLoader.Start((ulong)i);
                            return dataLoader;
                        }).ToArray();

                        try
                        {
                            // The main training loop.  Keep making mini-batches and giving them to the
                            // trainer until the learning rate has dropped below 1e-4.
                            while (trainer.GetLearningRate() >= 1e-4)
                            {
                                samples.DisposeElement();
                                labels.DisposeElement();
                                samples.Clear();
                                labels.Clear();

                                // make a 30-image mini-batch
                                while (samples.Count < 30)
                                {
                                    data.Dequeue(out var temp);

                                    samples.Add(temp.InputImage);
                                    labels.Add(temp.LabelImage);

                                    temp.Dispose();
                                }

                                LossMulticlassLogPerPixel.TrainOneStep(trainer, samples, labels);
                            }
                        }
                        finally
                        {
                            // Whether training finished or failed, tell the loader threads to stop and
                            // wait for them before the pipe is disposed; otherwise a still-running
                            // loader would keep using a disposed pipe and keep the process alive.
                            data.Disable();
                            foreach (var thread in threads)
                            {
                                thread.Join();
                            }

                            // Release the last mini-batch, which the loop above never gets to dispose.
                            samples.DisposeElement();
                            labels.DisposeElement();
                            samples.Clear();
                            labels.Clear();
                        }

                        // also wait for threaded processing to stop in the trainer.
                        trainer.GetNet();

                        net.Clean();
                        Console.WriteLine("saving network");
                        LossMulticlassLogPerPixel.Serialize(net, "semantic_segmentation_voc2012net.dnn");
                    }

                    // Make a copy of the network to use it for inference.
                    using (var anet = net.CloneAs(1))
                    {
                        Console.WriteLine("Testing the network...");

                        // Find the accuracy of the newly trained network on both the training and the validation sets.
                        Console.WriteLine($"train accuracy  :  {CalculateAccuracy(anet, PascalVOC2012.GetPascalVoc2012TrainListing(args[0]))}");
                        Console.WriteLine($"val accuracy    :  {CalculateAccuracy(anet, PascalVOC2012.GetPascalVoc2012ValListing(args[0]))}");
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                return 1;
            }

            return 0;
        }
Esempio n. 3
0
        /// <summary>
        /// Trains a multiclass-log network with the Adam solver on the "train" split of
        /// <c>parameter.Dataset</c>, validating periodically against the "test" split.
        /// </summary>
        /// <remarks>
        /// Per-epoch progress goes to the logger and to "{BaseName}.log"; the final network is
        /// serialized to "{BaseName}.tmp" before the last validation pass and the clean-up of
        /// <c>parameter.Output</c>. Exceptions are not propagated: only their message is logged.
        /// </remarks>
        /// <param name="parameter">
        /// Training settings: dataset, base name, output location, epoch count, learning rates,
        /// mini-batch size (must be non-zero) and validation interval (zero disables periodic validation).
        /// </param>
        private void Train(Parameter parameter)
        {
            try
            {
                IList<Matrix<C>> trainingImages;
                IList<T>         trainingLabels;
                IList<Matrix<C>> testingImages;
                IList<T>         testingLabels;

                Logger.Info("Start load train images");
                Load(parameter.Dataset, "train", out trainingImages, out trainingLabels);
                Logger.Info($"Load train images: {trainingImages.Count}");

                Logger.Info("Start load test images");
                Load(parameter.Dataset, "test", out testingImages, out testingLabels);
                Logger.Info($"Load test images: {testingImages.Count}");
                Logger.Info("");

                // So with that out of the way, we can make a network instance.
                var networkId = SetupNetwork();

                using (var net = new LossMulticlassLog(networkId))
                using (var solver = new Adam())
                using (var trainer = new DnnTrainer<LossMulticlassLog>(net, solver))
                {
                    var learningRate    = parameter.LearningRate;
                    var minLearningRate = parameter.MinLearningRate;
                    var miniBatchSize   = parameter.MiniBatchSize;
                    var baseName        = parameter.BaseName;
                    var epoch           = parameter.Epoch;
                    var validation      = parameter.Validation;

                    // A zero batch size makes the batch-count computation meaningless; fail with a
                    // clear message (logged by the catch block) instead of an obscure allocation error.
                    if (miniBatchSize == 0)
                    {
                        throw new ArgumentException("Mini batch size must be greater than zero.", nameof(parameter));
                    }

                    // Both validation passes use exactly the same settings, so build them in one place.
                    ValidationParameter<T, C> CreateValidationParameter()
                    {
                        return new ValidationParameter<T, C>
                        {
                            BaseName       = parameter.BaseName,
                            Output         = parameter.Output,
                            Trainer        = net,
                            TrainingImages = trainingImages,
                            TrainingLabels = trainingLabels,
                            TestingImages  = testingImages,
                            TestingLabels  = testingLabels,
                            UseConsole     = true,
                            SaveToXml      = true,
                            OutputDiffLog  = true
                        };
                    }

                    trainer.SetLearningRate(learningRate);
                    trainer.SetMinLearningRate(minLearningRate);
                    trainer.SetMiniBatchSize(miniBatchSize);
                    trainer.BeVerbose();
                    trainer.SetSynchronizationFile(baseName, 180);

                    // Pre-allocate the per-batch arrays once; they are refilled every epoch.
                    var trainingImagesCount = trainingImages.Count;

                    // Number of mini-batches per epoch, rounded up. Integer arithmetic avoids the
                    // precision loss of a float division for large data sets.
                    var maxIteration = (int)(((long)trainingImagesCount + miniBatchSize - 1) / miniBatchSize);
                    var imageBatches = new Matrix<C>[maxIteration][];
                    var labelBatches = new uint[maxIteration][];
                    for (var i = 0; i < maxIteration; i++)
                    {
                        // Every batch is full except possibly the last one. Images and labels of a
                        // batch are always sized identically because they are filled by the same loop.
                        var remaining = trainingImagesCount - (long)i * miniBatchSize;
                        var batchSize = remaining < miniBatchSize ? remaining : miniBatchSize;
                        imageBatches[i] = new Matrix<C>[batchSize];
                        labelBatches[i] = new uint[batchSize];
                    }

                    using (var fs = new FileStream($"{baseName}.log", FileMode.Create, FileAccess.Write, FileShare.Write))
                    using (var sw = new StreamWriter(fs, Encoding.UTF8))
                    {
                        for (var e = 0; e < epoch; e++)
                        {
                            // Visit the training samples in a fresh random order every epoch.
                            var randomArray = Enumerable.Range(0, trainingImagesCount).OrderBy(unused => Guid.NewGuid()).ToArray();
                            var index       = 0;
                            for (var i = 0; i < imageBatches.Length; i++)
                            {
                                var currentImages = imageBatches[i];
                                var currentLabels = labelBatches[i];
                                for (var j = 0; j < currentImages.Length; j++)
                                {
                                    var rIndex = randomArray[index];
                                    currentImages[j] = trainingImages[rIndex];
                                    currentLabels[j] = this.Cast(trainingLabels[rIndex]);
                                    index++;
                                }
                            }

                            for (var i = 0; i < maxIteration; i++)
                            {
                                LossMulticlassLog.TrainOneStep(trainer, imageBatches[i], labelBatches[i]);
                            }

                            var lr   = trainer.GetLearningRate();
                            var loss = trainer.GetAverageLoss();

                            var trainLog = $"Epoch: {e}, learning Rate: {lr}, average loss: {loss}";
                            Logger.Info(trainLog);
                            sw.WriteLine(trainLog);

                            // Validation runs on every multiple of the interval, epoch 0 included.
                            // An interval of zero disables it; checking that first keeps the modulo
                            // from dividing by zero.
                            if (validation > 0 && e % validation == 0)
                            {
                                Validation(CreateValidationParameter(), out var trainAccuracy, out var testAccuracy);

                                var validationLog = $"Epoch: {e}, train accuracy: {trainAccuracy}, test accuracy: {testAccuracy}";
                                Logger.Info(validationLog);
                                sw.WriteLine(validationLog);

                                var name = this.GetBaseName(parameter.Epoch,
                                                            parameter.LearningRate,
                                                            parameter.MinLearningRate,
                                                            parameter.MiniBatchSize);

                                UpdateBestModelFile(net, testAccuracy, parameter.Output, name, "test");
                                UpdateBestModelFile(net, trainAccuracy, parameter.Output, name, "train");
                            }

                            if (lr < minLearningRate)
                            {
                                Logger.Info($"Stop training: {lr} < {minLearningRate}");
                                break;
                            }
                        }
                    }

                    // wait for training threads to stop
                    trainer.GetNet();
                    Logger.Info("done training");

                    net.Clean();
                    LossMulticlassLog.Serialize(net, $"{baseName}.tmp");

                    // Run a final validation pass over both the training and the testing images
                    // with the fully trained network.
                    Validation(CreateValidationParameter(), out _, out _);

                    // clean up tmp files
                    Clean(parameter.Output);
                }
            }
            catch (Exception e)
            {
                Logger.Error(e.Message);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Metric-learning example: trains a small network on eight labeled 8-D one-hot vectors,
        /// prints the embedding of every sample and counts how many sample pairs end up on the
        /// correct side of the loss layer's distance threshold.
        /// </summary>
        /// <remarks>Any exception is printed to the console and swallowed.</remarks>
        private static void Main()
        {
            try
            {
                // The API for doing metric learning is very similar to the API for
                // multi-class classification.  In fact, the inputs are the same, a bunch of
                // labeled objects.  So here we create our dataset.  We make up some simple
                // vectors and label them with the integers 1,2,3,4.  The specific values of
                // the integer labels don't matter.
                var samples = new List<Matrix<double>>();
                var labels  = new List<uint>();

                // Eight one-hot column vectors; consecutive pairs share a class, giving
                // two training vectors for each of the classes 1..4.
                const int dimensions = 8;
                for (var k = 0; k < dimensions; k++)
                {
                    var values = new double[dimensions];
                    values[k] = 1;

                    samples.Add(new Matrix<double>(new MatrixTemplateSizeParameter(0, 1), values));
                    labels.Add((uint)(k / 2 + 1));
                }

                // Make a network that simply learns a linear mapping from 8D vectors to 2D
                // vectors.
                using (var net = new LossMetric(1))
                using (var trainer = new DnnTrainer<LossMetric>(net))
                {
                    trainer.SetLearningRate(0.1);

                    // It should be emphasized that it's really important that each mini-batch contain
                    // multiple instances of each class of object.  This is because the metric learning
                    // algorithm needs to consider pairs of objects that should be close as well as pairs
                    // of objects that should be far apart during each training step.  Here we just keep
                    // training on the same small batch so this constraint is trivially satisfied.
                    while (trainer.GetLearningRate() >= 1e-4)
                    {
                        LossMetric.TrainOneStep(trainer, samples, labels);
                    }

                    // Wait for training threads to stop
                    trainer.GetNet().Dispose();
                    Console.WriteLine("done training");

                    // Run all the samples through the network to get their 2D vector embeddings.
                    var embedded      = net.Operator(samples);
                    var embeddedCount = embedded.Count();

                    // Print the embedding for each sample to the screen.  If you look at the
                    // outputs carefully you should notice that they are grouped together in 2D
                    // space according to their label.
                    for (var i = 0; i < embeddedCount; ++i)
                    {
                        using (var trans = Dlib.Trans(embedded[i]))
                        {
                            Console.Write($"label: {labels[i]}\t{trans}");
                        }
                    }

                    // The loss_metric layer will cause things with the same label to be less
                    // than net.loss_details().get_distance_threshold() distance from each
                    // other.  So we can use that distance value as our testing threshold for
                    // "being near to each other".  Training is over, so read it once up front.
                    var distanceThreshold = net.GetLossDetails().GetDistanceThreshold();

                    // Now, check if the embedding puts things with the same labels near each other and
                    // things with different labels far apart.
                    var numRight = 0;
                    var numWrong = 0;
                    for (var i = 0; i < embeddedCount; ++i)
                    {
                        for (var j = i + 1; j < embeddedCount; ++j)
                        {
                            // The subtraction allocates a temporary matrix; dispose it right away
                            // instead of leaking one per compared pair.
                            using (var difference = embedded[i] - embedded[j])
                            {
                                var distance = Dlib.Length(difference);

                                // Same label: must be closer than the threshold.
                                // Different label: must be at least the threshold apart.
                                var isCorrect = labels[i] == labels[j]
                                    ? distance < distanceThreshold
                                    : distance >= distanceThreshold;

                                if (isCorrect)
                                {
                                    ++numRight;
                                }
                                else
                                {
                                    ++numWrong;
                                }
                            }
                        }
                    }

                    Console.WriteLine($"num_right: {numRight}");
                    Console.WriteLine($"num_wrong: {numWrong}");
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Trains an MMOD vehicle detector on the dlib rear-end vehicle dataset and
        /// reports its accuracy on the training and testing sets.
        /// </summary>
        /// <param name="args">
        /// Exactly one element: the path to a folder containing training.xml and
        /// testing.xml.  Any other argument count prints the usage text and returns.
        /// </param>
        /// <remarks>
        /// The trained network is written to "mmod_rear_end_vehicle_detector.dat" and the
        /// trainer state is periodically synchronized to "mmod_cars_sync".  Any exception
        /// raised while loading, training or testing is caught and written to the console.
        /// </remarks>
        private static void Main(string[] args)
        {
            try
            {
                if (args.Length != 1)
                {
                    Console.WriteLine("Give the path to a folder containing training.xml and testing.xml files.");
                    Console.WriteLine("This example program is specifically designed to run on the dlib vehicle ");
                    Console.WriteLine("detection dataset, which is available at this URL: ");
                    Console.WriteLine("   http://dlib.net/files/data/dlib_rear_end_vehicles_v1.tar");
                    Console.WriteLine();
                    Console.WriteLine("So download that dataset, extract it somewhere, and then run this program");
                    Console.WriteLine("with the dlib_rear_end_vehicles folder as an argument.  E.g. if you extract");
                    Console.WriteLine("the dataset to the current folder then you should run this example program");
                    Console.WriteLine("by typing: ");
                    Console.WriteLine("   ./dnn_mmod_train_find_cars_ex dlib_rear_end_vehicles");
                    Console.WriteLine();
                    Console.WriteLine("It takes about a day to finish if run on a high end GPU like a 1080ti.");
                    Console.WriteLine();
                    return;
                }

                var dataDirectory = args[0];

                IList <Matrix <RgbPixel> > imagesTrain;
                IList <Matrix <RgbPixel> > imagesTest;
                IList <IList <MModRect> >  boxesTrain;
                IList <IList <MModRect> >  boxesTest;
                Dlib.LoadImageDataset(dataDirectory + "/training.xml", out imagesTrain, out boxesTrain);
                Dlib.LoadImageDataset(dataDirectory + "/testing.xml", out imagesTest, out boxesTest);

                // When I was creating the dlib vehicle detection dataset I had to label all the cars
                // in each image.  MMOD requires all cars to be labeled, since any unlabeled part of an
                // image is implicitly assumed to be not a car, and the algorithm will use it as
                // negative training data.  So every car must be labeled, either with a normal
                // rectangle or an "ignore" rectangle that tells MMOD to simply ignore it (i.e. neither
                // treat it as a thing to detect nor as negative training data).
                //
                // In our present case, many images contain very tiny cars in the distance, ones that
                // are essentially just dark smudges.  It's not reasonable to expect the CNN
                // architecture we defined to detect such vehicles.  However, I erred on the side of
                // having more complete annotations when creating the dataset.  So when I labeled these
                // images I labeled many of these really difficult cases as vehicles to detect.
                //
                // So the first thing we are going to do is clean up our dataset a little bit.  In
                // particular, we are going to mark boxes smaller than 35*35 pixels as ignore since
                // only really small and blurry cars appear at those sizes.  We will also mark boxes
                // that are heavily overlapped by another box as ignore.  We do this because we want to
                // allow for stronger non-maximum suppression logic in the learned detector, since that
                // will help make it easier to learn a good detector.
                //
                // To explain this non-max suppression idea further it's important to understand how
                // the detector works.  Essentially, sliding window detectors scan all image locations
                // and ask "is there a car here?".  If there really is a car in a specific location in
                // an image then usually many slightly different sliding window locations will produce
                // high detection scores, indicating that there is a car at those locations.  If we
                // just stopped there then each car would produce multiple detections.  But that isn't
                // what we want.  We want each car to produce just one detection.  So it's common for
                // detectors to include "non-maximum suppression" logic which simply takes the
                // strongest detection and then deletes all detections "close to" the strongest.  This
                // is a simple post-processing step that can eliminate duplicate detections.  However,
                // we have to define what "close to" means.  We can do this by looking at your training
                // data and checking how close the closest target boxes are to each other, and then
                // picking a "close to" measure that doesn't suppress those target boxes but is
                // otherwise as tight as possible.  This is exactly what the mmod_options object does
                // by default.
                //
                // Importantly, this means that if your training dataset contains an image with two
                // target boxes that really overlap a whole lot, then the non-maximum suppression
                // "close to" measure will be configured to allow detections to really overlap a whole
                // lot.  On the other hand, if your dataset didn't contain any overlapped boxes at all,
                // then the non-max suppression logic would be configured to filter out any boxes that
                // overlapped at all, and thus would be performing a much stronger non-max suppression.
                //
                // Why does this matter?  Well, remember that we want to avoid duplicate detections.
                // If non-max suppression just kills everything in a really wide area around a car then
                // the CNN doesn't really need to learn anything about avoiding duplicate detections.
                // However, if non-max suppression only suppresses a tiny area around each detection
                // then the CNN will need to learn to output small detection scores for those areas of
                // the image not suppressed.  The smaller the non-max suppression region the more the
                // CNN has to learn and the more difficult the learning problem will become.  This is
                // why we remove highly overlapped objects from the training dataset.  That is, we do
                // it so the non-max suppression logic will be able to be reasonably effective.  Here
                // we are ensuring that any boxes that are entirely contained by another are
                // suppressed.  We also ensure that boxes with an intersection over union of 0.5 or
                // greater are suppressed.  This will improve the resulting detector since it will be
                // able to use more aggressive non-max suppression settings.

                var numOverlappedIgnoredTest = 0;
                foreach (var v in boxesTest)
                {
                    using (var overlap = new TestBoxOverlap(0.50, 0.95))
                        numOverlappedIgnoredTest += IgnoreOverlappedBoxes(v, overlap);
                }

                var numOverlappedIgnored = 0;
                var numAdditionalIgnored = 0;

                foreach (var v in boxesTrain)
                {
                    using (var overlap = new TestBoxOverlap(0.50, 0.95))
                        numOverlappedIgnored += IgnoreOverlappedBoxes(v, overlap);
                    foreach (var bb in v)
                    {
                        if (bb.Rect.Width < 35 && bb.Rect.Height < 35)
                        {
                            if (!bb.Ignore)
                            {
                                bb.Ignore = true;
                                ++numAdditionalIgnored;
                            }
                        }

                        // The dlib vehicle detection dataset doesn't contain any detections with
                        // really extreme aspect ratios.  However, some datasets do, often because of
                        // bad labeling.  So it's a good idea to check for that and either eliminate
                        // those boxes or set them to ignore.  Although, this depends on your
                        // application.
                        //
                        // For instance, if your dataset has boxes with an aspect ratio
                        // of 10 then you should think about what that means for the network
                        // architecture.  Does the receptive field even cover the entirety of the box
                        // in those cases?  Do you care about these boxes?  Are they labeling errors?
                        // I find that many people will download some dataset from the internet and
                        // just take it as given.  They run it through some training algorithm and take
                        // the dataset as unchallengeable truth.  But many datasets are full of
                        // labeling errors.  There are also a lot of datasets that aren't full of
                        // errors, but are annotated in a sloppy and inconsistent way.  Fixing those
                        // errors and inconsistencies can often greatly improve models trained from
                        // such data.  It's almost always worth the time to try and improve your
                        // training dataset.
                        //
                        // In any case, my point is that there are other types of dataset cleaning you
                        // could put here.  What exactly you need depends on your application.  But you
                        // should carefully consider it and not take your dataset as a given.  The work
                        // of creating a good detector is largely about creating a high quality
                        // training dataset.
                    }
                }

                // When modifying a dataset like this, it's a really good idea to print a log of how
                // many boxes you ignored.  It's easy to accidentally ignore a huge block of data, so
                // you should always look and see that things are doing what you expect.
                Console.WriteLine($"num_overlapped_ignored: {numOverlappedIgnored}");
                Console.WriteLine($"num_additional_ignored: {numAdditionalIgnored}");
                Console.WriteLine($"num_overlapped_ignored_test: {numOverlappedIgnoredTest}");


                Console.WriteLine($"num training images: {imagesTrain.Count}");
                Console.WriteLine($"num testing images: {imagesTest.Count}");


                // Our vehicle detection dataset has basically 3 different types of boxes.  Square
                // boxes, tall and skinny boxes (e.g. semi trucks), and short and wide boxes (e.g.
                // sedans).  Here we are telling the MMOD algorithm that a vehicle is recognizable as
                // long as the longest box side is at least 70 pixels long and the shortest box side is
                // at least 30 pixels long.  mmod_options will use these parameters to decide how large
                // each of the sliding windows needs to be so as to be able to detect all the vehicles.
                // Since our dataset has basically these 3 different aspect ratios, it will decide to
                // use 3 different sliding windows.  This means the final con layer in the network will
                // have 3 filters, one for each of these aspect ratios.
                //
                // Another thing to consider when setting the sliding window size is the "stride" of
                // your network.  The network we defined above downsamples the image by a factor of 8x
                // in the first few layers.  So when the sliding windows are scanning the image, they
                // are stepping over it with a stride of 8 pixels.  If you set the sliding window size
                // too small then the stride will become an issue.  For instance, if you set the
                // sliding window size to 4 pixels, then it means a 4x4 window will be moved by 8
                // pixels at a time when scanning. This is obviously a problem since 75% of the image
                // won't even be visited by the sliding window.  So you need to set the window size to
                // be big enough relative to the stride of your network.  In our case, the windows are
                // at least 30 pixels in length, so being moved by 8 pixel steps is fine.
                using (var options = new MModOptions(boxesTrain, 70, 30))
                {
                    // This setting is very important and dataset specific.  The vehicle detection dataset
                    // contains boxes that are marked as "ignore", as we discussed above.  Some of them are
                    // ignored because we set ignore to true in the above code.  However, the xml files
                    // also contained a lot of ignore boxes.  Some of them are large boxes that encompass
                    // large parts of an image and the intention is to have everything inside those boxes
                    // be ignored.  Therefore, we need to tell the MMOD algorithm to do that, which we do
                    // by setting options.overlaps_ignore appropriately.
                    //
                    // But first, we need to understand exactly what this option does.  The MMOD loss
                    // is essentially counting the number of false alarms + missed detections produced by
                    // the detector for each image.  During training, the code is running the detector on
                    // each image in a mini-batch and looking at its output and counting the number of
                    // mistakes.  The optimizer tries to find parameter settings that minimize the number
                    // of detector mistakes.
                    //
                    // This overlaps_ignore option allows you to tell the loss that some outputs from the
                    // detector should be totally ignored, as if they never happened.  In particular, if a
                    // detection overlaps a box in the training data with ignore==true then that detection
                    // is ignored.  This overlap is determined by calling
                    // options.overlaps_ignore(the_detection, the_ignored_training_box).  If it returns
                    // true then that detection is ignored.
                    //
                    // You should read the documentation for test_box_overlap, the class type for
                    // overlaps_ignore for full details.  However, the gist is that the default behavior is
                    // to only consider boxes as overlapping if their intersection over union is > 0.5.
                    // However, the dlib vehicle detection dataset contains large boxes that are meant to
                    // mask out large areas of an image.  So intersection over union isn't an appropriate
                    // way to measure "overlaps with box" in this case.  We want any box that is contained
                    // inside one of these big regions to be ignored, even if the detection box is really
                    // small.  So we set overlaps_ignore to behave that way with this line.
                    options.OverlapsIgnore = new TestBoxOverlap(0.5, 0.95);

                    using (var net = new LossMmod(options, 3))
                    {
                        // The final layer of the network must be a con layer that contains
                        // options.detector_windows.size() filters.  This is because these final filters are
                        // what perform the final "sliding window" detection in the network.  For the dlib
                        // vehicle dataset, there will be 3 sliding window detectors, so we will be setting
                        // num_filters to 3 here.
                        var detectorWindows = options.DetectorWindows.ToArray();
                        using (var subnet = net.GetSubnet())
                            using (var details = subnet.GetLayerDetails())
                            {
                                details.SetNumFilters(detectorWindows.Length);

                                using (var trainer = new DnnTrainer <LossMmod>(net))
                                {
                                    trainer.SetLearningRate(0.1);
                                    trainer.BeVerbose();


                                    // While training, we are going to use early stopping.  That is, we will be checking
                                    // how well the detector is performing on our test data and when it stops getting
                                    // better on the test data we will drop the learning rate.  We will keep doing that
                                    // until the learning rate is less than 1e-4.   These two settings tell the trainer to
                                    // do that.  Essentially, we are setting the first argument to infinity, and only the
                                    // test iterations without progress threshold will matter.  In particular, it says that
                                    // once we observe 1000 testing mini-batches where the test loss clearly isn't
                                    // decreasing we will lower the learning rate.
                                    trainer.SetIterationsWithoutProgressThreshold(50000);
                                    trainer.SetTestIterationsWithoutProgressThreshold(1000);

                                    const string syncFilename = "mmod_cars_sync";
                                    trainer.SetSynchronizationFile(syncFilename, 5 * 60);



                                    IEnumerable <Matrix <RgbPixel> >      miniBatchSamples;
                                    IEnumerable <IEnumerable <MModRect> > miniBatchLabels;
                                    using (var cropper = new RandomCropper())
                                    {
                                        cropper.SetSeed(0);
                                        cropper.SetChipDims(350, 350);
                                        // Usually you want to give the cropper whatever min sizes you passed to the
                                        // mmod_options constructor, or very slightly smaller sizes, which is what we do here.
                                        cropper.SetMinObjectSize(69, 28);
                                        cropper.MaxRotationDegrees = 2;

                                        using (var rnd = new Rand())
                                        {
                                            // Log the training parameters to the console
                                            Console.WriteLine($"{trainer}{cropper}");

                                            var cnt = 1;
                                            // Run the trainer until the learning rate gets small.
                                            while (trainer.GetLearningRate() >= 1e-4)
                                            {
                                                // Every 30 mini-batches we do a testing mini-batch.
                                                if (cnt % 30 != 0 || !imagesTest.Any())
                                                {
                                                    cropper.Operator(87, imagesTrain, boxesTrain, out miniBatchSamples, out miniBatchLabels);
                                                    // We can also randomly jitter the colors and that often helps a detector
                                                    // generalize better to new images.
                                                    foreach (var img in miniBatchSamples)
                                                    {
                                                        Dlib.DisturbColors(img, rnd);
                                                    }

                                                    // It's a good idea to, at least once, put code here that displays the images
                                                    // and boxes the random cropper is generating.  You should look at them and
                                                    // think about if the output makes sense for your problem.  Most of the time
                                                    // it will be fine, but sometimes you will realize that the pattern of cropping
                                                    // isn't really appropriate for your problem and you will need to make some
                                                    // change to how the mini-batches are being generated.  Maybe you will tweak
                                                    // some of the cropper's settings, or write your own entirely separate code to
                                                    // create mini-batches.  But either way, if you don't look you will never know.
                                                    // An easy way to do this is to create a dlib::image_window to display the
                                                    // images and boxes.

                                                    LossMmod.TrainOneStep(trainer, miniBatchSamples, miniBatchLabels);

                                                    miniBatchSamples.DisposeElement();
                                                    miniBatchLabels.DisposeElement();
                                                }
                                                else
                                                {
                                                    cropper.Operator(87, imagesTest, boxesTest, out miniBatchSamples, out miniBatchLabels);
                                                    // We can also randomly jitter the colors and that often helps a detector
                                                    // generalize better to new images.
                                                    foreach (var img in miniBatchSamples)
                                                    {
                                                        Dlib.DisturbColors(img, rnd);
                                                    }

                                                    LossMmod.TestOneStep(trainer, miniBatchSamples, miniBatchLabels);

                                                    miniBatchSamples.DisposeElement();
                                                    miniBatchLabels.DisposeElement();
                                                }
                                                ++cnt;
                                            }
                                            // wait for training threads to stop
                                            trainer.GetNet();
                                            Console.WriteLine("done training");

                                            // Save the network to disk
                                            net.Clean();
                                            LossMmod.Serialize(net, "mmod_rear_end_vehicle_detector.dat");


                                            // It's a really good idea to print the training parameters.  This is because you will
                                            // invariably be running multiple rounds of training and should be logging the output
                                            // to a file.  This print statement will include many of the training parameters in
                                            // your log.
                                            Console.WriteLine($"{trainer}{cropper}");

                                            Console.WriteLine($"\nsync_filename: {syncFilename}");
                                            Console.WriteLine($"num training images: {imagesTrain.Count}");
                                            using (var overlapTester = new TestBoxOverlap())
                                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTrain, boxesTrain, overlapTester, 0, options.OverlapsIgnore))
                                                    Console.WriteLine($"training results: {matrix}");
                                            // Upsampling the data will allow the detector to find smaller cars.  Recall that
                                            // we configured it to use a sliding window nominally 70 pixels in size.  So upsampling
                                            // here will let it find things nominally 35 pixels in size.  Although we include a
                                            // limit of 1800*1800 here which means "don't upsample an image if it's already larger
                                            // than 1800*1800".  We do this so we don't run out of RAM, which is a concern because
                                            // some of the images in the dlib vehicle dataset are really high resolution.
                                            Dlib.UpsampleImageDataset(2, imagesTrain, boxesTrain, 1800 * 1800);
                                            using (var overlapTester = new TestBoxOverlap())
                                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTrain, boxesTrain, overlapTester, 0, options.OverlapsIgnore))
                                                    Console.WriteLine($"training upsampled results: {matrix}");


                                            // Interpolated so the actual test-set size is logged rather than
                                            // the literal placeholder text.
                                            Console.WriteLine($"num testing images: {imagesTest.Count}");
                                            using (var overlapTester = new TestBoxOverlap())
                                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTest, boxesTest, overlapTester, 0, options.OverlapsIgnore))
                                                    Console.WriteLine($"testing results: {matrix}");
                                            Dlib.UpsampleImageDataset(2, imagesTest, boxesTest, 1800 * 1800);
                                            using (var overlapTester = new TestBoxOverlap())
                                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTest, boxesTest, overlapTester, 0, options.OverlapsIgnore))
                                                    Console.WriteLine($"testing upsampled results: {matrix}");

                                            /*
                                             *  This program takes many hours to execute on a high end GPU.  It took about a day to
                                             *  train on a NVIDIA 1080ti.  The resulting model file is available at
                                             *      http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
                                             *  It should be noted that this file on dlib.net has a dlib::shape_predictor appended
                                             *  onto the end of it (see dnn_mmod_find_cars_ex.cpp for an example of its use).  This
                                             *  explains why the model file on dlib.net is larger than the
                                             *  mmod_rear_end_vehicle_detector.dat output by this program.
                                             *
                                             *  You can see some videos of this vehicle detector running on YouTube:
                                             *      https://www.youtube.com/watch?v=4B3bzmxMAZU
                                             *      https://www.youtube.com/watch?v=bP2SUo5vSlc
                                             *
                                             *  Also, the training and testing accuracies were:
                                             *      num training images: 2217
                                             *      training results: 0.990738 0.736431 0.736073
                                             *      training upsampled results: 0.986837 0.937694 0.936912
                                             *      num testing images: 135
                                             *      testing results: 0.988827 0.471372 0.470806
                                             *      testing upsampled results: 0.987879 0.651132 0.650399
                                             */
                                        }
                                    }
                                }
                            }
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
            }
        }
Esempio n. 6
0
        private static void Main(string[] args)
        {
            try
            {
                // In this example we are going to train a face detector based on the
                // small faces dataset in the examples/faces directory.  So the first
                // thing we do is load that dataset.  This means you need to supply the
                // path to this faces folder as a command line argument so we will know
                // where it is.
                if (args.Length != 1)
                {
                    Console.WriteLine("Give the path to the examples/faces directory as the argument to this");
                    Console.WriteLine("program.  For example, if you are in the examples folder then execute ");
                    Console.WriteLine("this program by running: ");
                    Console.WriteLine("   ./dnn_mmod_ex faces");
                    return;
                }

                var facesDirectory = args[0];

                // The faces directory contains a training dataset and a separate
                // testing dataset.  The training data consists of 4 images, each
                // annotated with rectangles that bound each human face.  The idea is
                // to use this training data to learn to identify human faces in new
                // images.
                //
                // Once you have trained an object detector it is always important to
                // test it on data it wasn't trained on.  Therefore, we will also load
                // a separate testing set of 5 images.  Once we have a face detector
                // created from the training data we will see how well it works by
                // running it on the testing images.
                //
                // So here we create the variables that will hold our dataset.
                // images_train will hold the 4 training images and face_boxes_train
                // holds the locations of the faces in the training images.  So for
                // example, the image images_train[0] has the faces given by the
                // rectangles in face_boxes_train[0].
                IList <Matrix <RgbPixel> > imagesTrain;
                IList <Matrix <RgbPixel> > imagesTest;
                IList <IList <MModRect> >  faceBoxesTrain;
                IList <IList <MModRect> >  faceBoxesTest;

                // Now we load the data.  These XML files list the images in each dataset
                // and also contain the positions of the face boxes.  Obviously you can use
                // any kind of input format you like so long as you store the data into
                // images_train and face_boxes_train.  But for convenience dlib comes with
                // tools for creating and loading XML image datasets.  Here you see how to
                // load the data.  To create the XML files you can use the imglab tool which
                // can be found in the tools/imglab folder.  It is a simple graphical tool
                // for labeling objects in images with boxes.  To see how to use it read the
                // tools/imglab/README.txt file.
                Dlib.LoadImageDataset(facesDirectory + "/training.xml", out imagesTrain, out faceBoxesTrain);
                Dlib.LoadImageDataset(facesDirectory + "/testing.xml", out imagesTest, out faceBoxesTest);

                Console.WriteLine($"num training images: {imagesTrain.Count()}");
                Console.WriteLine($"num testing images:  {imagesTest.Count()}");


                // The MMOD algorithm has some options you can set to control its behavior.  However,
                // you can also call the constructor with your training annotations and a "target
                // object size" and it will automatically configure itself in a reasonable way for your
                // problem.  Here we are saying that faces are still recognizably faces when they are
                // 40x40 pixels in size.  You should generally pick the smallest size where this is
                // true.  Based on this information the mmod_options constructor will automatically
                // pick a good sliding window width and height.  It will also automatically set the
                // non-max-suppression parameters to something reasonable.  For further details see the
                // mmod_options documentation.
                using (var options = new MModOptions(faceBoxesTrain, 40, 40))
                {
                    // The detector will automatically decide to use multiple sliding windows if needed.
                    // For the face data, only one is needed however.
                    var detectorWindows = options.DetectorWindows.ToArray();
                    Console.WriteLine($"num detector windows: {detectorWindows.Length}");
                    foreach (var w in detectorWindows)
                    {
                        Console.WriteLine($"detector window width by height: {w.Width} x {w.Height}");
                    }

                    Console.WriteLine($"overlap NMS IOU thresh:             {options.OverlapsNms.GetIouThresh()}");
                    Console.WriteLine($"overlap NMS percent covered thresh: {options.OverlapsNms.GetPercentCoveredThresh()}");

                    // Now we are ready to create our network and trainer.
                    using (var net = new LossMmod(options, 2))
                    {
                        // The MMOD loss requires that the number of filters in the final network layer equal
                        // the number of detector windows in options.DetectorWindows.  So we set that here as well.
                        using (var subnet = net.GetSubnet())
                            using (var details = subnet.GetLayerDetails())
                            {
                                details.SetNumFilters(detectorWindows.Length);
                                using (var trainer = new DnnTrainer <LossMmod>(net))
                                {
                                    trainer.SetLearningRate(0.1);
                                    trainer.BeVerbose();
                                    trainer.SetSynchronizationFile("mmod_sync", 5 * 60);
                                    trainer.SetIterationsWithoutProgressThreshold(300);

                                    // Now let's train the network.  We are going to use mini-batches of 150
                                    // images.   The images are random crops from our training set (see
                                    // random_cropper_ex.cpp for a discussion of the random_cropper).
                                    IEnumerable <Matrix <RgbPixel> > miniBatchSamples;
                                    //IEnumerable<IEnumerable<RgbPixel>> mini_batch_labels;
                                    IEnumerable <IEnumerable <MModRect> > miniBatchLabels;

                                    using (var cropper = new RandomCropper())
                                        using (var chipDims = new ChipDims(200, 200))
                                        {
                                            cropper.ChipDims = chipDims;
                                            // Usually you want to give the cropper whatever min sizes you passed to the
                                            // mmod_options constructor, which is what we do here.
                                            cropper.SetMinObjectSize(40, 40);

                                            using (var rnd = new Rand())
                                            {
                                                // Run the trainer until the learning rate gets small.  This will probably take several
                                                // hours.
                                                while (trainer.GetLearningRate() >= 1e-4)
                                                {
                                                    cropper.Operator(150, imagesTrain, faceBoxesTrain, out miniBatchSamples, out miniBatchLabels);
                                                    // We can also randomly jitter the colors and that often helps a detector
                                                    // generalize better to new images.
                                                    foreach (var img in miniBatchSamples)
                                                    {
                                                        Dlib.DisturbColors(img, rnd);
                                                    }

                                                    LossMmod.TrainOneStep(trainer, miniBatchSamples, miniBatchLabels);

                                                    miniBatchSamples.DisposeElement();
                                                    miniBatchLabels.DisposeElement();
                                                }
                                                // wait for training threads to stop
                                                trainer.GetNet();
                                                Console.WriteLine("done training");

                                                // Save the network to disk
                                                net.Clean();
                                                LossMmod.Serialize(net, "mmod_network.dat");


                                                // Now that we have a face detector we can test it.  The first statement tests it
                                                // on the training data.  It will print the precision, recall, and then average precision.
                                                // This statement should indicate that the network works perfectly on the
                                                // training data.
                                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTrain, faceBoxesTrain))
                                                    Console.WriteLine($"training results: {matrix}");
                                                // However, to get an idea if it really worked without overfitting we need to run
                                                // it on images it wasn't trained on.  The next line does this.   Happily,
                                                // this statement indicates that the detector finds most of the faces in the
                                                // testing data.
                                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTest, faceBoxesTest))
                                                    Console.WriteLine($"testing results:  {matrix}");


                                                // If you are running many experiments, it's also useful to log the settings used
                                                // during the training experiment.  This statement will print the settings we used to
                                                // the screen.
                                                Console.WriteLine($"{trainer}{cropper}");

                                                // Now let's run the detector on the testing images and look at the outputs.
                                                using (var win = new ImageWindow())
                                                    foreach (var img in imagesTest)
                                                    {
                                                        Dlib.PyramidUp(img);
                                                        var dets = net.Operator(img);
                                                        win.ClearOverlay();
                                                        win.SetImage(img);
                                                        foreach (var d in dets[0])
                                                        {
                                                            win.AddOverlay(d);
                                                        }

                                                        Console.ReadKey();

                                                        foreach (var det in dets)
                                                        {
                                                            foreach (var d in det)
                                                            {
                                                                d.Dispose();
                                                            }
                                                        }
                                                    }

                                                // Now that you finished this example, you should read dnn_mmod_train_find_cars_ex.cpp,
                                                // which is a more advanced example.  It discusses many issues surrounding properly
                                                // setting the MMOD parameters and creating a good training dataset.
                                            }
                                        }
                                }
                            }
                    }

                    detectorWindows.DisposeElement();
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
            }
        }