public void Deserialize2()
{
    var path = Path.Combine(this.ModelDirectory, "mmod_human_face_detector.dat");
    using (var loss = LossMmod.Deserialize(File.ReadAllBytes(path)))
        Assert.Equal(21, loss.NumLayers);
}
/// <summary>
/// Initializes a new instance of the <see cref="FaceRecognition"/> class with the directory path that stores model files.
/// </summary>
/// <param name="directory">The directory path that stores model files.</param>
/// <exception cref="FileNotFoundException">The model file is not found.</exception>
/// <exception cref="DirectoryNotFoundException">The specified directory path is not found.</exception>
private FaceRecognition(string directory)
{
    if (!Directory.Exists(directory))
        throw new DirectoryNotFoundException(directory);

    var predictor68PointModel = Path.Combine(directory, FaceRecognitionModels.GetPosePredictorModelLocation());
    if (!File.Exists(predictor68PointModel))
        throw new FileNotFoundException(predictor68PointModel);

    var predictor5PointModel = Path.Combine(directory, FaceRecognitionModels.GetPosePredictorFivePointModelLocation());
    if (!File.Exists(predictor5PointModel))
        throw new FileNotFoundException(predictor5PointModel);

    var cnnFaceDetectionModel = Path.Combine(directory, FaceRecognitionModels.GetCnnFaceDetectorModelLocation());
    if (!File.Exists(cnnFaceDetectionModel))
        throw new FileNotFoundException(cnnFaceDetectionModel);

    var faceRecognitionModel = Path.Combine(directory, FaceRecognitionModels.GetFaceRecognitionModelLocation());
    if (!File.Exists(faceRecognitionModel))
        throw new FileNotFoundException(faceRecognitionModel);

    this._FaceDetector?.Dispose();
    this._FaceDetector = DlibDotNet.Dlib.GetFrontalFaceDetector();

    this._PosePredictor68Point?.Dispose();
    this._PosePredictor68Point = ShapePredictor.Deserialize(predictor68PointModel);

    this._PosePredictor5Point?.Dispose();
    this._PosePredictor5Point = ShapePredictor.Deserialize(predictor5PointModel);

    this._CnnFaceDetector?.Dispose();
    this._CnnFaceDetector = LossMmod.Deserialize(cnnFaceDetectionModel);

    this._FaceEncoder?.Dispose();
    this._FaceEncoder = LossMetric.Deserialize(faceRecognitionModel);

    // The 194-point predictor is optional: load it only when the model file is present.
    var predictor194PointModel = Path.Combine(directory, FaceRecognitionModels.GetPosePredictor194PointModelLocation());
    if (File.Exists(predictor194PointModel))
    {
        this._PosePredictor194Point?.Dispose();
        this._PosePredictor194Point = ShapePredictor.Deserialize(predictor194PointModel);
    }
}
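// Usage sketch for the constructor above (not from the original source). It assumes the
// library exposes a public factory such as FaceRecognition.Create(string) wrapping the
// private constructor; the factory name and the "models" directory are assumptions.
public static void FaceRecognitionUsageSketch()
{
    // "models" must contain the four required .dat files checked for above.
    using (var faceRecognition = FaceRecognition.Create("models"))
    {
        // Disposing the instance releases the native detectors and predictors
        // loaded by the constructor.
    }
}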
public static IEnumerable<IEnumerable<MModRect>> DetectMulti(LossMmod net, IEnumerable<Image> images, int upsampleNumTimes, int batchSize = 128)
{
    var dimgs = new List<Matrix<RgbPixel>>();
    var allRects = new List<IEnumerable<MModRect>>();

    using (var pyr = new PyramidDown(2))
    {
        // Copy the data into dlib based objects.
        foreach (var matrix in images)
        {
            var image = new Matrix<RgbPixel>();
            var type = matrix.Matrix.MatrixElementType;
            switch (type)
            {
                case MatrixElementTypes.UInt8:
                case MatrixElementTypes.RgbPixel:
                    DlibDotNet.Dlib.AssignImage(matrix.Matrix, image);
                    break;
                default:
                    throw new NotSupportedException("Unsupported image type, must be 8bit gray or RGB image.");
            }

            // Upsample to make smaller faces detectable; every image gets the
            // same number of pyramid steps so the batch stays uniform.
            for (var i = 0; i < upsampleNumTimes; i++)
                DlibDotNet.Dlib.PyramidUp(image);

            dimgs.Add(image);
        }

        // Batch inference requires uniform dimensions.
        for (var i = 1; i < dimgs.Count; i++)
            if (dimgs[i - 1].Columns != dimgs[i].Columns || dimgs[i - 1].Rows != dimgs[i].Rows)
                throw new ArgumentException("Images in list must all have the same dimensions.");

        // Run the whole batch through the network once, then map each detection
        // back to the original (non-upsampled) image coordinates.
        var dets = net.Operator(dimgs, (ulong)batchSize);
        foreach (var det in dets)
        {
            var rects = new List<MModRect>();
            foreach (var d in det)
            {
                var drect = pyr.RectDown(new DRectangle(d.Rect), (uint)upsampleNumTimes);
                d.Rect = new Rectangle((int)drect.Left, (int)drect.Top, (int)drect.Right, (int)drect.Bottom);
                rects.Add(d);
            }

            allRects.Add(rects);
        }

        // The intermediate matrices are no longer needed once detections are collected.
        foreach (var image in dimgs)
            image.Dispose();
    }

    return allRects;
}
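// Usage sketch for DetectMulti above (not from the original source). It assumes images are
// obtained through a loader such as FaceRecognition.LoadImageFile (an assumption; any source
// of Image instances with matching dimensions works) and that the model file is alongside
// the executable.
public static void DetectMultiUsageSketch()
{
    var path = "mmod_human_face_detector.dat";
    using (var net = LossMmod.Deserialize(File.ReadAllBytes(path)))
    using (var image1 = FaceRecognition.LoadImageFile("person1.jpg"))
    using (var image2 = FaceRecognition.LoadImageFile("person2.jpg"))
    {
        // All images in one batch must share the same dimensions (checked by DetectMulti).
        var batches = DetectMulti(net, new[] { image1, image2 }, upsampleNumTimes: 1);
        foreach (var faces in batches)
            foreach (var face in faces)
                Console.WriteLine($"face: {face.Rect}");
    }
}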
public void Create()
{
    var networkIds = Enumerable.Range(0, 4);
    foreach (var networkId in networkIds)
        using (var loss = new LossMmod(networkId))
            Assert.True(!loss.IsDisposed);
}
private static void Main()
{
    try
    {
        // You can get this file from http://dlib.net/files/mmod_front_and_rear_end_vehicle_detector.dat.bz2
        // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
        // As you can see, the file also includes a separately trained shape_predictor. To see
        // a generic example of how to train those refer to train_shape_predictor_ex.cpp.
        using (var deserialize = new ProxyDeserialize("mmod_front_and_rear_end_vehicle_detector.dat"))
        using (var net = LossMmod.Deserialize(deserialize, 1))
        using (var sp = ShapePredictor.Deserialize(deserialize))
        using (var img = Dlib.LoadImageAsMatrix<RgbPixel>("mmod_cars_test_image2.jpg"))
        using (var win = new ImageWindow())
        {
            win.SetImage(img);

            // Run the detector on the image and show us the output.
            var dets = net.Operator(img).First();
            foreach (var d in dets)
            {
                // We use a shape_predictor to refine the exact shape and location of the detection
                // box. This shape_predictor is trained to simply output the 4 corner points of
                // the box. So all we do is make a rectangle that tightly contains those 4 points
                // and that rectangle is our refined detection position.
                var fd = sp.Detect(img, d);
                var rect = Rectangle.Empty;
                for (var j = 0u; j < fd.Parts; ++j)
                    rect += fd.GetPart(j);

                if (d.Label == "rear")
                    win.AddOverlay(rect, new RgbPixel(255, 0, 0), d.Label);
                else
                    win.AddOverlay(rect, new RgbPixel(255, 255, 0), d.Label);
            }

            Console.WriteLine("Hit enter to end program");
            Console.ReadKey();
        }
    }
    catch (ImageLoadException ile)
    {
        Console.WriteLine(ile.Message);
        Console.WriteLine("The test image is located in the examples folder. So you should run this program from a sub folder so that the relative path is correct.");
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
    }
}
public static void SetAllBnRunningStatsWindowSizes(LossMmod net, uint newWindowSize)
{
    if (net == null)
        throw new ArgumentNullException(nameof(net));

    net.ThrowIfDisposed();

    var ret = NativeMethods.set_all_bn_running_stats_window_sizes_loss_mmod(net.NativePtr, net.NetworkType, newWindowSize);
    if (ret == NativeMethods.ErrorType.DnnNotSupportNetworkType)
        throw new NotSupportNetworkTypeException(net.NetworkType);
}
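// Usage sketch for the helper above (not from the original source): adjusting the batch-norm
// running-stats window of a pretrained detector before fine-tuning. The window size of 1000
// is an illustrative value, not a recommendation from the source; qualify the call with its
// declaring type when calling from another class.
public static void SetBnWindowSizeUsageSketch()
{
    using (var net = LossMmod.Deserialize("mmod_human_face_detector.dat"))
    {
        // A smaller window lets the running mean/variance adapt faster to new data.
        SetAllBnRunningStatsWindowSizes(net, 1000);
    }
}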
public static IEnumerable<MModRect> Detect(LossMmod net, Image image, int upsampleNumTimes)
{
    using (var pyr = new PyramidDown(2))
    {
        var rects = new List<MModRect>();

        // Copy the data into dlib based objects.
        using (var matrix = new Matrix<RgbPixel>())
        {
            var type = image.Mode;
            switch (type)
            {
                case Mode.Greyscale:
                case Mode.Rgb:
                    DlibDotNet.Dlib.AssignImage(image.Matrix, matrix);
                    break;
                default:
                    throw new NotSupportedException("Unsupported image type, must be 8bit gray or RGB image.");
            }

            // Upsampling the image will allow us to detect smaller faces but will cause the
            // program to use more RAM and run longer.
            var levels = upsampleNumTimes;
            while (levels > 0)
            {
                levels--;
                DlibDotNet.Dlib.PyramidUp<PyramidDown>(matrix, 2);
            }

            var dets = net.Operator(matrix);

            // Scale the detection locations back to the original image size
            // if the image was upscaled.
            foreach (var d in dets.First())
            {
                var drect = pyr.RectDown(new DRectangle(d.Rect), (uint)upsampleNumTimes);
                d.Rect = new Rectangle((int)drect.Left, (int)drect.Top, (int)drect.Right, (int)drect.Bottom);
                rects.Add(d);
            }

            return rects;
        }
    }
}
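// Usage sketch for Detect above (not from the original source). As with DetectMulti, the
// FaceRecognition.LoadImageFile loader and the file names are assumptions.
public static void DetectUsageSketch()
{
    using (var net = LossMmod.Deserialize("mmod_human_face_detector.dat"))
    using (var image = FaceRecognition.LoadImageFile("Lenna.jpg"))
    {
        // upsampleNumTimes = 1 doubles the image once: smaller faces become detectable
        // at the cost of more RAM and compute.
        foreach (var d in Detect(net, image, upsampleNumTimes: 1))
            Console.WriteLine($"face at {d.Rect}");
    }
}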
public void Operator()
{
    var image = this.GetDataFile("Lenna.jpg");
    var path = Path.Combine(this.ModelDirectory, "mmod_human_face_detector.dat");

    using (var net1 = LossMmod.Deserialize(path))
    using (var net2 = LossMmod.Deserialize(File.ReadAllBytes(path)))
    using (var matrix = Dlib.LoadImageAsMatrix<RgbPixel>(image.FullName))
    using (var ret1 = net1.Operator(matrix))
    using (var ret2 = net2.Operator(matrix))
    {
        Assert.Equal(1, ret1.Count);
        Assert.Equal(1, ret2.Count);

        var r1 = ret1[0].ToArray();
        var r2 = ret2[0].ToArray();
        Assert.Equal(r1.Length, r2.Length);
        Assert.Equal(r1[0].Rect.Left, r2[0].Rect.Left);
        Assert.Equal(r1[0].Rect.Right, r2[0].Rect.Right);
        Assert.Equal(r1[0].Rect.Top, r2[0].Rect.Top);
        Assert.Equal(r1[0].Rect.Bottom, r2[0].Rect.Bottom);
    }
}
private static int Main(string[] args)
{
    if (args.Length < 1)
    {
        Console.WriteLine("To run this program you need a copy of the PASCAL VOC2012 dataset.");
        Console.WriteLine();
        Console.WriteLine("You call this program like this: ");
        Console.WriteLine("./dnn_instance_segmentation_train_ex /path/to/VOC2012 [det-minibatch-size] [seg-minibatch-size] [class-1] [class-2] [class-3] ...");
        return 1;
    }

    try
    {
        Console.WriteLine("\nSCANNING PASCAL VOC2012 DATASET");
        Console.WriteLine();

        var listing = PascalVOC2012.GetPascalVoc2012TrainListing(args[0]).ToArray();
        Console.WriteLine($"images in entire dataset: {listing.Length}");
        if (listing.Length == 0)
        {
            Console.WriteLine("Didn't find the VOC2012 dataset.");
            return 1;
        }

        // Mini-batches smaller than the default can be used with GPUs having less memory.
        var argc = args.Length;
        var detMiniBatchSize = argc >= 2 ? int.Parse(args[1]) : 35;
        var segMiniBatchSize = argc >= 3 ? int.Parse(args[2]) : 100;
        Console.WriteLine($"det mini-batch size: {detMiniBatchSize}");
        Console.WriteLine($"seg mini-batch size: {segMiniBatchSize}");

        var desiredClassLabels = new List<string>();
        for (var arg = 3; arg < argc; ++arg)
            desiredClassLabels.Add(args[arg]);

        if (!desiredClassLabels.Any())
        {
            desiredClassLabels.Add("bicycle");
            desiredClassLabels.Add("car");
            desiredClassLabels.Add("cat");
        }

        Console.Write("desired classlabels:");
        foreach (var desiredClassLabel in desiredClassLabels)
            Console.Write($" {desiredClassLabel}");
        Console.WriteLine();

        // Extract the MMOD rects.
        Console.Write("\nExtracting all truth instances...");
        var truthInstances = LoadAllTruthInstances(listing);
        Console.WriteLine(" Done!");
        Console.WriteLine();

        if (listing.Length != truthInstances.Count)
            throw new ApplicationException();

        var originalTruthImages = new List<TruthImage>();
        for (int i = 0, end = listing.Length; i < end; ++i)
            originalTruthImages.Add(new TruthImage
            {
                Info = listing[i],
                TruthInstances = truthInstances[i]
            });

        var truthImagesFilteredByClass = FilterBasedOnClassLabel(originalTruthImages, desiredClassLabels);
        Console.WriteLine($"images in dataset filtered by class: {truthImagesFilteredByClass.Count}");

        IgnoreSomeTruthBoxes(truthImagesFilteredByClass);
        var truthImages = FilterImagesWithNoTruth(truthImagesFilteredByClass);
        Console.WriteLine($"images in dataset after ignoring some truth boxes: {truthImages.Count}");

        // First train an object detector network (loss_mmod).
        Console.WriteLine("\nTraining detector network:");
        var detNet = TrainDetectionNetwork(truthImages, (uint)detMiniBatchSize);

        // Then train mask predictors (segmentation).
        var segNetsByClass = new Dictionary<string, LossMulticlassLogPerPixel>();

        // This flag controls if a separate mask predictor is trained for each class.
        // Note that it would also be possible to train a separate mask predictor for
        // class groups, each containing somehow similar classes -- for example, one
        // mask predictor for cars and buses, another for cats and dogs, and so on.
        const bool separateSegNetForEachClass = true;
        if (separateSegNetForEachClass)
        {
            foreach (var classLabel in desiredClassLabels)
            {
                // Consider only the truth images belonging to this class.
                var classImages = FilterBasedOnClassLabel(truthImages, new[] { classLabel });
                Console.WriteLine($"\nTraining segmentation network for class {classLabel}:");
                segNetsByClass[classLabel] = TrainSegmentationNetwork(classImages, (uint)segMiniBatchSize, classLabel);
            }
        }
        else
        {
            Console.WriteLine("Training a single segmentation network:");
            segNetsByClass[""] = TrainSegmentationNetwork(truthImages, (uint)segMiniBatchSize, "");
        }

        Console.WriteLine("Saving networks");
        using (var proxy = new ProxySerialize(InstanceSegmentationNetFilename))
        {
            LossMmod.Serialize(proxy, detNet);
            segNetsByClass.Serialize(proxy, 4);
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
    }

    return 0;
}
private static void Main(string[] args)
{
    if (args.Length != 1)
    {
        Console.WriteLine("You call this program like this: ");
        Console.WriteLine("./dnn_instance_segmentation_train_ex /path/to/images");
        Console.WriteLine();
        Console.WriteLine($"You will also need a trained '{InstanceSegmentationNetFilename}' file.");
        Console.WriteLine("You can either train it yourself (see example program");
        Console.WriteLine("dnn_instance_segmentation_train_ex), or download a");
        Console.WriteLine($"copy from here: http://dlib.net/files/{InstanceSegmentationNetFilename}");
        return;
    }

    try
    {
        // Read the file containing the trained network from the working directory.
        using (var deserialize = new ProxyDeserialize(InstanceSegmentationNetFilename))
        using (var detNet = LossMmod.Deserialize(deserialize, 4))
        {
            var segNetsByClass = new Dictionary<string, LossMulticlassLogPerPixel>();
            segNetsByClass.Deserialize(deserialize, 4);

            // Show inference results in a window.
            using (var win = new ImageWindow())
            {
                // Find supported image files.
                var files = Directory.GetFiles(args[0])
                                     .Where(s => s.EndsWith(".jpeg") || s.EndsWith(".jpg") || s.EndsWith(".png"))
                                     .ToArray();

                using (var rnd = new Rand())
                {
                    Console.WriteLine($"Found {files.Length} images, processing...");
                    foreach (var file in files.Select(s => new FileInfo(s)))
                    {
                        // Load the input image.
                        using (var inputImage = Dlib.LoadImageAsMatrix<RgbPixel>(file.FullName))
                        {
                            // Find the object instances in the input image, and process them
                            // from smallest to largest.
                            using (var output = detNet.Operator(inputImage))
                            {
                                var instances = output.First().ToList();
                                instances.Sort((lhs, rhs) => (int)lhs.Rect.Area - (int)rhs.Rect.Area);

                                using (var rgbLabelImage = new Matrix<RgbPixel>())
                                {
                                    rgbLabelImage.SetSize(inputImage.Rows, inputImage.Columns);
                                    rgbLabelImage.Assign(Enumerable.Range(0, rgbLabelImage.Size).Select(i => new RgbPixel(0, 0, 0)).ToArray());

                                    var foundSomething = false;
                                    foreach (var instance in instances)
                                    {
                                        if (!foundSomething)
                                        {
                                            Console.Write("Found ");
                                            foundSomething = true;
                                        }
                                        else
                                        {
                                            Console.Write(", ");
                                        }

                                        Console.Write(instance.Label);

                                        var croppingRect = GetCroppingRect(instance.Rect);
                                        using (var dims = new ChipDims(SegDim, SegDim))
                                        using (var chipDetails = new ChipDetails(croppingRect, dims))
                                        using (var inputChip = Dlib.ExtractImageChip<RgbPixel>(inputImage, chipDetails, InterpolationTypes.Bilinear))
                                        {
                                            if (!segNetsByClass.TryGetValue(instance.Label, out var i))
                                            {
                                                // Per-class segmentation net not found, so we must be using the same net for all
                                                // classes (see bool separate_seg_net_for_each_class in
                                                // dnn_instance_segmentation_train_ex.cpp). Sanity-check that assumption: there
                                                // must be exactly one net, keyed by the empty string.
                                                if (segNetsByClass.Count != 1)
                                                    throw new ApplicationException();
                                                if (!string.IsNullOrEmpty(segNetsByClass.First().Key))
                                                    throw new ApplicationException();
                                            }

                                            var segNet = i != null
                                                ? i                              // use the segmentation net trained for this class
                                                : segNetsByClass.First().Value;  // use the same segmentation net for all classes

                                            using (var mask = segNet.Operator(inputChip))
                                            {
                                                var randomColor = new RgbPixel(
                                                    rnd.GetRandom8BitNumber(),
                                                    rnd.GetRandom8BitNumber(),
                                                    rnd.GetRandom8BitNumber()
                                                );

                                                using (var resizedMask = new Matrix<ushort>((int)chipDetails.Rect.Height, (int)chipDetails.Rect.Width))
                                                {
                                                    Dlib.ResizeImage(mask.First(), resizedMask);

                                                    for (int r = 0, nr = resizedMask.Rows; r < nr; ++r)
                                                        for (int c = 0, nc = resizedMask.Columns; c < nc; ++c)
                                                            if (resizedMask[r, c] != 0)
                                                            {
                                                                var y = (int)(chipDetails.Rect.Top + r);
                                                                var x = (int)(chipDetails.Rect.Left + c);
                                                                if (y >= 0 && y < rgbLabelImage.Rows && x >= 0 && x < rgbLabelImage.Columns)
                                                                    rgbLabelImage[y, x] = randomColor;
                                                            }
                                                }

                                                var voc2012Class = PascalVOC2012.FindVoc2012Class(instance.Label);
                                                Dlib.DrawRectangle(rgbLabelImage, instance.Rect, voc2012Class.RgbLabel, 1u);
                                            }
                                        }
                                    }

                                    instances.DisposeElement();

                                    using (var tmp = Dlib.JoinRows(inputImage, rgbLabelImage))
                                    {
                                        // Show the input image on the left, and the predicted RGB labels on the right.
                                        win.SetImage(tmp);

                                        if (instances.Any())
                                        {
                                            Console.Write($" in {file.Name} - hit enter to process the next image");
                                            Console.ReadKey();
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            foreach (var kvp in segNetsByClass)
                kvp.Value.Dispose();
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
    }
}
private static void Main(string[] args)
{
    try
    {
        if (args.Length != 2)
        {
            Console.WriteLine("Call this program like this:");
            Console.WriteLine("./dnn_mmod_dog_hipsterizer mmod_dog_hipsterizer.dat faces/dogs.jpg");
            Console.WriteLine("You can get the mmod_dog_hipsterizer.dat file from:");
            Console.WriteLine("http://dlib.net/files/mmod_dog_hipsterizer.dat.bz2");
            return;
        }

        // Load the models as well as glasses and mustache.
        using (var deserialize = new ProxyDeserialize(args[0]))
        using (var net = LossMmod.Deserialize(deserialize))
        using (var sp = ShapePredictor.Deserialize(deserialize))
        using (var glasses = Matrix<RgbAlphaPixel>.Deserialize(deserialize))
        using (var mustache = Matrix<RgbAlphaPixel>.Deserialize(deserialize))
        {
            Dlib.PyramidUp(glasses);
            Dlib.PyramidUp(mustache);

            using (var win1 = new ImageWindow(glasses))
            using (var win2 = new ImageWindow(mustache))
            using (var winWireframe = new ImageWindow())
            using (var winHipster = new ImageWindow())
            {
                // Now process each image, find dogs, and hipsterize them by drawing glasses and a
                // mustache on each dog :)
                for (var i = 1; i < args.Length; ++i)
                {
                    using (var img = Dlib.LoadImageAsMatrix<RgbPixel>(args[i]))
                    {
                        // Upsampling the image will allow us to find smaller dog faces but will use more
                        // computational resources.
                        //pyramid_up(img);

                        var dets = net.Operator(img).First();
                        winWireframe.ClearOverlay();
                        winWireframe.SetImage(img);

                        // We will also draw a wireframe on each dog's face so you can see where the
                        // shape_predictor is identifying face landmarks.
                        var lines = new List<ImageWindow.OverlayLine>();
                        foreach (var d in dets)
                        {
                            // Get the landmarks for this dog's face.
                            var shape = sp.Detect(img, d.Rect);

                            var color = new RgbPixel(0, 255, 0);
                            var top = shape.GetPart(0);
                            var leftEar = shape.GetPart(1);
                            var leftEye = shape.GetPart(2);
                            var nose = shape.GetPart(3);
                            var rightEar = shape.GetPart(4);
                            var rightEye = shape.GetPart(5);

                            // The locations of the left and right ends of the mustache.
                            var leftMustache = 1.3 * (leftEye - rightEye) / 2 + nose;
                            var rightMustache = 1.3 * (rightEye - leftEye) / 2 + nose;

                            // Draw the glasses onto the image.
                            var from = new[] { 2 * new Point(176, 36), 2 * new Point(59, 35) };
                            var to = new[] { leftEye, rightEye };
                            using (var transform = Dlib.FindSimilarityTransform(from, to))
                                for (uint r = 0, nr = (uint)glasses.Rows; r < nr; ++r)
                                    for (uint c = 0, nc = (uint)glasses.Columns; c < nc; ++c)
                                    {
                                        var p = (Point)transform.Operator(new DPoint(c, r));
                                        if (Dlib.GetRect(img).Contains(p))
                                        {
                                            var rgb = img[p.Y, p.X];
                                            Dlib.AssignPixel(ref rgb, glasses[(int)r, (int)c]);
                                            img[p.Y, p.X] = rgb;
                                        }
                                    }

                            // Draw the mustache onto the image right under the dog's nose.
                            var mustacheRect = Dlib.GetRect(mustache);
                            from = new[] { mustacheRect.TopLeft, mustacheRect.TopRight };
                            to = new[] { rightMustache, leftMustache };
                            using (var transform = Dlib.FindSimilarityTransform(from, to))
                                for (uint r = 0, nr = (uint)mustache.Rows; r < nr; ++r)
                                    for (uint c = 0, nc = (uint)mustache.Columns; c < nc; ++c)
                                    {
                                        var p = (Point)transform.Operator(new DPoint(c, r));
                                        if (Dlib.GetRect(img).Contains(p))
                                        {
                                            var rgb = img[p.Y, p.X];
                                            Dlib.AssignPixel(ref rgb, mustache[(int)r, (int)c]);
                                            img[p.Y, p.X] = rgb;
                                        }
                                    }

                            // Record the lines needed for the face wire frame.
                            lines.Add(new ImageWindow.OverlayLine(leftEye, nose, color));
                            lines.Add(new ImageWindow.OverlayLine(nose, rightEye, color));
                            lines.Add(new ImageWindow.OverlayLine(rightEye, leftEye, color));
                            lines.Add(new ImageWindow.OverlayLine(rightEye, rightEar, color));
                            lines.Add(new ImageWindow.OverlayLine(rightEar, top, color));
                            lines.Add(new ImageWindow.OverlayLine(top, leftEar, color));
                            lines.Add(new ImageWindow.OverlayLine(leftEar, leftEye, color));

                            winWireframe.AddOverlay(lines);
                            winHipster.SetImage(img);
                        }

                        Console.WriteLine("Hit enter to process the next image.");
                        Console.ReadKey();
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
    }
}
private static void Main(string[] args)
{
    try
    {
        if (args.Length != 1)
        {
            Console.WriteLine("Give the path to a folder containing training.xml and testing.xml files.");
            Console.WriteLine("This example program is specifically designed to run on the dlib vehicle ");
            Console.WriteLine("detection dataset, which is available at this URL: ");
            Console.WriteLine("   http://dlib.net/files/data/dlib_rear_end_vehicles_v1.tar");
            Console.WriteLine();
            Console.WriteLine("So download that dataset, extract it somewhere, and then run this program");
            Console.WriteLine("with the dlib_rear_end_vehicles folder as an argument. E.g. if you extract");
            Console.WriteLine("the dataset to the current folder then you should run this example program");
            Console.WriteLine("by typing: ");
            Console.WriteLine("   ./dnn_mmod_train_find_cars_ex dlib_rear_end_vehicles");
            Console.WriteLine();
            Console.WriteLine("It takes about a day to finish if run on a high end GPU like a 1080ti.");
            Console.WriteLine();
            return;
        }

        var dataDirectory = args[0];

        IList<Matrix<RgbPixel>> imagesTrain;
        IList<Matrix<RgbPixel>> imagesTest;
        IList<IList<MModRect>> boxesTrain;
        IList<IList<MModRect>> boxesTest;
        Dlib.LoadImageDataset(dataDirectory + "/training.xml", out imagesTrain, out boxesTrain);
        Dlib.LoadImageDataset(dataDirectory + "/testing.xml", out imagesTest, out boxesTest);

        // When I was creating the dlib vehicle detection dataset I had to label all the cars
        // in each image. MMOD requires all cars to be labeled, since any unlabeled part of an
        // image is implicitly assumed to be not a car, and the algorithm will use it as
        // negative training data. So every car must be labeled, either with a normal
        // rectangle or an "ignore" rectangle that tells MMOD to simply ignore it (i.e. neither
        // treat it as a thing to detect nor as negative training data).
        //
        // In our present case, many images contain very tiny cars in the distance, ones that
        // are essentially just dark smudges. It's not reasonable to expect the CNN
        // architecture we defined to detect such vehicles. However, I erred on the side of
        // having more complete annotations when creating the dataset. So when I labeled these
        // images I labeled many of these really difficult cases as vehicles to detect.
        //
        // So the first thing we are going to do is clean up our dataset a little bit. In
        // particular, we are going to mark boxes smaller than 35*35 pixels as ignore since
        // only really small and blurry cars appear at those sizes. We will also mark boxes
        // that are heavily overlapped by another box as ignore. We do this because we want to
        // allow for stronger non-maximum suppression logic in the learned detector, since that
        // will help make it easier to learn a good detector.
        //
        // To explain this non-max suppression idea further it's important to understand how
        // the detector works. Essentially, sliding window detectors scan all image locations
        // and ask "is there a car here?". If there really is a car in a specific location in
        // an image then usually many slightly different sliding window locations will produce
        // high detection scores, indicating that there is a car at those locations. If we
        // just stopped there then each car would produce multiple detections. But that isn't
        // what we want. We want each car to produce just one detection. So it's common for
        // detectors to include "non-maximum suppression" logic which simply takes the
        // strongest detection and then deletes all detections "close to" the strongest.
        // This is a simple post-processing step that can eliminate duplicate detections.
        // However, we have to define what "close to" means. We can do this by looking at your
        // training data and checking how close the closest target boxes are to each other, and
        // then picking a "close to" measure that doesn't suppress those target boxes but is
        // otherwise as tight as possible. This is exactly what the mmod_options object does
        // by default.
        //
        // Importantly, this means that if your training dataset contains an image with two
        // target boxes that really overlap a whole lot, then the non-maximum suppression
        // "close to" measure will be configured to allow detections to really overlap a whole
        // lot. On the other hand, if your dataset didn't contain any overlapped boxes at all,
        // then the non-max suppression logic would be configured to filter out any boxes that
        // overlapped at all, and thus would be performing a much stronger non-max suppression.
        //
        // Why does this matter? Well, remember that we want to avoid duplicate detections.
        // If non-max suppression just kills everything in a really wide area around a car then
        // the CNN doesn't really need to learn anything about avoiding duplicate detections.
        // However, if non-max suppression only suppresses a tiny area around each detection
        // then the CNN will need to learn to output small detection scores for those areas of
        // the image not suppressed. The smaller the non-max suppression region the more the
        // CNN has to learn and the more difficult the learning problem will become. This is
        // why we remove highly overlapped objects from the training dataset. That is, we do
        // it so the non-max suppression logic will be able to be reasonably effective. Here
        // we are ensuring that any boxes that are entirely contained by another are
        // suppressed. We also ensure that boxes with an intersection over union of 0.5 or
        // greater are suppressed. This will improve the resulting detector since it will be
        // able to use more aggressive non-max suppression settings.
        var numOverlappedIgnoredTest = 0;
        foreach (var v in boxesTest)
            using (var overlap = new TestBoxOverlap(0.50, 0.95))
                numOverlappedIgnoredTest += IgnoreOverlappedBoxes(v, overlap);

        var numOverlappedIgnored = 0;
        var numAdditionalIgnored = 0;
        foreach (var v in boxesTrain)
        {
            using (var overlap = new TestBoxOverlap(0.50, 0.95))
                numOverlappedIgnored += IgnoreOverlappedBoxes(v, overlap);

            foreach (var bb in v)
            {
                if (bb.Rect.Width < 35 && bb.Rect.Height < 35)
                {
                    if (!bb.Ignore)
                    {
                        bb.Ignore = true;
                        ++numAdditionalIgnored;
                    }
                }

                // The dlib vehicle detection dataset doesn't contain any detections with
                // really extreme aspect ratios. However, some datasets do, often because of
                // bad labeling. So it's a good idea to check for that and either eliminate
                // those boxes or set them to ignore. Although, this depends on your
                // application.
                //
                // For instance, if your dataset has boxes with an aspect ratio
                // of 10 then you should think about what that means for the network
                // architecture. Does the receptive field even cover the entirety of the box
                // in those cases? Do you care about these boxes? Are they labeling errors?
                // I find that many people will download some dataset from the internet and
                // just take it as given. They run it through some training algorithm and take
                // the dataset as unchallengeable truth. But many datasets are full of
                // labeling errors. There are also a lot of datasets that aren't full of
                // errors, but are annotated in a sloppy and inconsistent way.
                // Fixing those errors and inconsistencies can often greatly improve models
                // trained from such data. It's almost always worth the time to try and improve
                // your training dataset.
                //
                // In any case, my point is that there are other types of dataset cleaning you
                // could put here. What exactly you need depends on your application. But you
                // should carefully consider it and not take your dataset as a given. The work
                // of creating a good detector is largely about creating a high quality
                // training dataset.
            }
        }

        // When modifying a dataset like this, it's a really good idea to print a log of how
        // many boxes you ignored. It's easy to accidentally ignore a huge block of data, so
        // you should always look and see that things are doing what you expect.
        Console.WriteLine($"num_overlapped_ignored: {numOverlappedIgnored}");
        Console.WriteLine($"num_additional_ignored: {numAdditionalIgnored}");
        Console.WriteLine($"num_overlapped_ignored_test: {numOverlappedIgnoredTest}");

        Console.WriteLine($"num training images: {imagesTrain.Count()}");
        Console.WriteLine($"num testing images: {imagesTest.Count()}");

        // Our vehicle detection dataset has basically 3 different types of boxes. Square
        // boxes, tall and skinny boxes (e.g. semi trucks), and short and wide boxes (e.g.
        // sedans). Here we are telling the MMOD algorithm that a vehicle is recognizable as
        // long as the longest box side is at least 70 pixels long and the shortest box side is
        // at least 30 pixels long. mmod_options will use these parameters to decide how large
        // each of the sliding windows needs to be so as to be able to detect all the vehicles.
        // Since our dataset has basically these 3 different aspect ratios, it will decide to
        // use 3 different sliding windows. This means the final con layer in the network will
        // have 3 filters, one for each of these aspect ratios.
        //
        // Another thing to consider when setting the sliding window size is the "stride" of
        // your network. The network we defined above downsamples the image by a factor of 8x
        // in the first few layers. So when the sliding windows are scanning the image, they
        // are stepping over it with a stride of 8 pixels. If you set the sliding window size
        // too small then the stride will become an issue. For instance, if you set the
        // sliding window size to 4 pixels, then it means a 4x4 window will be moved by 8
        // pixels at a time when scanning. This is obviously a problem since 75% of the image
        // won't even be visited by the sliding window. So you need to set the window size to
        // be big enough relative to the stride of your network. In our case, the windows are
        // at least 30 pixels in length, so being moved by 8 pixel steps is fine.
        using (var options = new MModOptions(boxesTrain, 70, 30))
        {
            // This setting is very important and dataset specific. The vehicle detection dataset
            // contains boxes that are marked as "ignore", as we discussed above. Some of them are
            // ignored because we set ignore to true in the above code. However, the xml files
            // also contained a lot of ignore boxes. Some of them are large boxes that encompass
            // large parts of an image and the intention is to have everything inside those boxes
            // be ignored. Therefore, we need to tell the MMOD algorithm to do that, which we do
            // by setting options.overlaps_ignore appropriately.
            //
            // But first, we need to understand exactly what this option does. The MMOD loss
            // is essentially counting the number of false alarms + missed detections produced by
            // the detector for each image.
            // During training, the code is running the detector on each image in a mini-batch
            // and looking at its output and counting the number of mistakes. The optimizer
            // tries to find parameter settings that minimize the number of detector mistakes.
            //
            // This overlaps_ignore option allows you to tell the loss that some outputs from the
            // detector should be totally ignored, as if they never happened. In particular, if a
            // detection overlaps a box in the training data with ignore==true then that detection
            // is ignored. This overlap is determined by calling
            // options.overlaps_ignore(the_detection, the_ignored_training_box). If it returns
            // true then that detection is ignored.
            //
            // You should read the documentation for test_box_overlap, the class type for
            // overlaps_ignore, for full details. However, the gist is that the default behavior is
            // to only consider boxes as overlapping if their intersection over union is > 0.5.
            // However, the dlib vehicle detection dataset contains large boxes that are meant to
            // mask out large areas of an image. So intersection over union isn't an appropriate
            // way to measure "overlaps with box" in this case. We want any box that is contained
            // inside one of these big regions to be ignored, even if the detection box is really
            // small. So we set overlaps_ignore to behave that way with this line.
            options.OverlapsIgnore = new TestBoxOverlap(0.5, 0.95);

            using (var net = new LossMmod(options, 3))
            {
                // The final layer of the network must be a con layer that contains
                // options.detector_windows.size() filters. This is because these final filters are
                // what perform the final "sliding window" detection in the network. For the dlib
                // vehicle dataset, there will be 3 sliding window detectors, so we will be setting
                // num_filters to 3 here.
                var detectorWindows = options.DetectorWindows.ToArray();
                using (var subnet = net.GetSubnet())
                using (var details = subnet.GetLayerDetails())
                {
                    details.SetNumFilters(detectorWindows.Length);

                    using (var trainer = new DnnTrainer<LossMmod>(net))
                    {
                        trainer.SetLearningRate(0.1);
                        trainer.BeVerbose();

                        // While training, we are going to use early stopping. That is, we will be checking
                        // how good the detector is performing on our test data and when it stops getting
                        // better on the test data we will drop the learning rate. We will keep doing that
                        // until the learning rate is less than 1e-4. These two settings tell the trainer to
                        // do that. Essentially, we are setting the first argument to infinity, and only the
                        // test iterations without progress threshold will matter. In particular, it says that
                        // once we observe 1000 testing mini-batches where the test loss clearly isn't
                        // decreasing we will lower the learning rate.
                        trainer.SetIterationsWithoutProgressThreshold(50000);
                        trainer.SetTestIterationsWithoutProgressThreshold(1000);

                        const string syncFilename = "mmod_cars_sync";
                        trainer.SetSynchronizationFile(syncFilename, 5 * 60);

                        IEnumerable<Matrix<RgbPixel>> miniBatchSamples;
                        IEnumerable<IEnumerable<MModRect>> miniBatchLabels;

                        using (var cropper = new RandomCropper())
                        {
                            cropper.SetSeed(0);
                            cropper.SetChipDims(350, 350);

                            // Usually you want to give the cropper whatever min sizes you passed to the
                            // mmod_options constructor, or very slightly smaller sizes, which is what we do here.
                            cropper.SetMinObjectSize(69, 28);
                            cropper.MaxRotationDegrees = 2;

                            using (var rnd = new Rand())
                            {
                                // Log the training parameters to the console.
                                Console.WriteLine($"{trainer}{cropper}");

                                var cnt = 1;
                                // Run the trainer until the learning rate gets small.
                                while (trainer.GetLearningRate() >= 1e-4)
                                {
                                    // Every 30 mini-batches we do a testing mini-batch.
                                    if (cnt % 30 != 0 || !imagesTest.Any())
                                    {
                                        cropper.Operator(87, imagesTrain, boxesTrain, out miniBatchSamples, out miniBatchLabels);

                                        // We can also randomly jitter the colors and that often helps a detector
                                        // generalize better to new images.
                                        foreach (var img in miniBatchSamples)
                                            Dlib.DisturbColors(img, rnd);

                                        // It's a good idea to, at least once, put code here that displays the images
                                        // and boxes the random cropper is generating. You should look at them and
                                        // think about if the output makes sense for your problem. Most of the time
                                        // it will be fine, but sometimes you will realize that the pattern of cropping
                                        // isn't really appropriate for your problem and you will need to make some
                                        // change to how the mini-batches are being generated. Maybe you will tweak
                                        // some of the cropper's settings, or write your own entirely separate code to
                                        // create mini-batches. But either way, if you don't look you will never know.
                                        // An easy way to do this is to create a dlib::image_window to display the
                                        // images and boxes.
                                        LossMmod.TrainOneStep(trainer, miniBatchSamples, miniBatchLabels);

                                        miniBatchSamples.DisposeElement();
                                        miniBatchLabels.DisposeElement();
                                    }
                                    else
                                    {
                                        cropper.Operator(87, imagesTest, boxesTest, out miniBatchSamples, out miniBatchLabels);

                                        // We can also randomly jitter the colors and that often helps a detector
                                        // generalize better to new images.
                                        foreach (var img in miniBatchSamples)
                                            Dlib.DisturbColors(img, rnd);

                                        LossMmod.TestOneStep(trainer, miniBatchSamples, miniBatchLabels);

                                        miniBatchSamples.DisposeElement();
                                        miniBatchLabels.DisposeElement();
                                    }

                                    ++cnt;
                                }

                                // Wait for training threads to stop.
                                trainer.GetNet();
                                Console.WriteLine("done training");

                                // Save the network to disk.
                                net.Clean();
                                LossMmod.Serialize(net, "mmod_rear_end_vehicle_detector.dat");

                                // It's a really good idea to print the training parameters. This is because you will
                                // invariably be running multiple rounds of training and should be logging the output
                                // to a file. This print statement will include many of the training parameters in
                                // your log.
                                Console.WriteLine($"{trainer}{cropper}");

                                Console.WriteLine($"\nsync_filename: {syncFilename}");
                                Console.WriteLine($"num training images: {imagesTrain.Count()}");

                                using (var _ = new TestBoxOverlap())
                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTrain, boxesTrain, _, 0, options.OverlapsIgnore))
                                    Console.WriteLine($"training results: {matrix}");

                                // Upsampling the data will allow the detector to find smaller cars. Recall that
                                // we configured it to use a sliding window nominally 70 pixels in size. So upsampling
                                // here will let it find things nominally 35 pixels in size. Although we include a
                                // limit of 1800*1800 here which means "don't upsample an image if it's already larger
                                // than 1800*1800". We do this so we don't run out of RAM, which is a concern because
                                // some of the images in the dlib vehicle dataset are really high resolution.
                                Dlib.UpsampleImageDataset(2, imagesTrain, boxesTrain, 1800 * 1800);
                                using (var _ = new TestBoxOverlap())
                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTrain, boxesTrain, _, 0, options.OverlapsIgnore))
                                    Console.WriteLine($"training upsampled results: {matrix}");

                                Console.WriteLine($"num testing images: {imagesTest.Count()}");
                                using (var _ = new TestBoxOverlap())
                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTest, boxesTest, _, 0, options.OverlapsIgnore))
                                    Console.WriteLine($"testing results: {matrix}");

                                Dlib.UpsampleImageDataset(2, imagesTest, boxesTest, 1800 * 1800);
                                using (var _ = new TestBoxOverlap())
                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTest, boxesTest, _, 0, options.OverlapsIgnore))
                                    Console.WriteLine($"testing upsampled results: {matrix}");

                                /*
                                    This program takes many hours to execute on a high end GPU. It took about a day to
                                    train on a NVIDIA 1080ti. The resulting model file is available at
                                        http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
                                    It should be noted that this file on dlib.net has a dlib::shape_predictor appended
                                    onto the end of it (see dnn_mmod_find_cars_ex.cpp for an example of its use). This
                                    explains why the model file on dlib.net is larger than the
                                    mmod_rear_end_vehicle_detector.dat output by this program.

                                    You can see some videos of this vehicle detector running on YouTube:
                                        https://www.youtube.com/watch?v=4B3bzmxMAZU
                                        https://www.youtube.com/watch?v=bP2SUo5vSlc

                                    Also, the training and testing accuracies were:
                                        num training images: 2217
                                        training results: 0.990738 0.736431 0.736073
                                        training upsampled results: 0.986837 0.937694 0.936912
                                        num testing images: 135
                                        testing results: 0.988827 0.471372 0.470806
                                        testing upsampled results: 0.987879 0.651132 0.650399
                                */
                            }
                        }
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
    }
}
private static void Main()
{
    try
    {
        // You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
        // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
        // As you can see, the file also includes a separately trained shape_predictor. To see
        // a generic example of how to train those refer to train_shape_predictor_ex.cpp.
        using (var deserialize = new ProxyDeserialize("mmod_rear_end_vehicle_detector.dat"))
        using (var net = LossMmod.Deserialize(deserialize, 1))
        using (var sp = ShapePredictor.Deserialize(deserialize))
        using (var img = Dlib.LoadImageAsMatrix<RgbPixel>("mmod_cars_test_image.jpg"))
        using (var win = new ImageWindow())
        {
            win.SetImage(img);

            // Run the detector on the image and show us the output.
            var dets = net.Operator(img).First();
            foreach (var d in dets)
            {
                // We use a shape_predictor to refine the exact shape and location of the detection
                // box. This shape_predictor is trained to simply output the 4 corner points of
                // the box. So all we do is make a rectangle that tightly contains those 4 points
                // and that rectangle is our refined detection position.
                var fd = sp.Detect(img, d);
                var rect = Rectangle.Empty;
                for (var j = 0u; j < fd.Parts; ++j)
                    rect += fd.GetPart(j);

                win.AddOverlay(rect, new RgbPixel(255, 0, 0));
            }

            Console.WriteLine("Hit enter to view the intermediate processing steps");
            Console.ReadKey();

            // Now let's look at how the detector works. The high level processing steps look like:
            //   1. Create an image pyramid and pack the pyramid into one big image. We call this
            //      image the "tiled pyramid".
            //   2. Run the tiled pyramid image through the CNN. The CNN outputs a new image where
            //      bright pixels in the output image indicate the presence of cars.
            //   3. Find pixels in the CNN's output image with a value > 0. Those locations are your
            //      preliminary car detections.
            //   4. Perform non-maximum suppression on the preliminary detections to produce the
            //      final output.
            //
            // We will be plotting the images from steps 1 and 2 so you can visualize what's
            // happening. For the CNN's output image, we will use the jet colormap so that "bright"
            // outputs, i.e. pixels with big values, appear in red and "dim" outputs appear as a
            // cold blue color. To do this we pick a range of CNN output values for the color
            // mapping. The specific values don't matter. They are just selected to give a nice
            // looking output image.
            const float lower = -2.5f;
            const float upper = 0.0f;
            Console.WriteLine($"jet color mapping range: lower={lower} upper={upper}");

            // Create a tiled pyramid image and display it on the screen.
            // Get the type of pyramid the CNN used:
            //using pyramid_type = std::remove_reference<decltype(input_layer(net))>::type::pyramid_type;
            // And tell create_tiled_pyramid to create the pyramid using that pyramid type.
            using (var inputLayer = new InputRgbImagePyramid<PyramidDown>(6))
            {
                net.TryGetInputLayer(inputLayer);

                var padding = inputLayer.GetPyramidPadding();
                var outerPadding = inputLayer.GetPyramidOuterPadding();
                Dlib.CreateTiledPyramid<RgbPixel, PyramidDown>(img, padding, outerPadding, 6, out var tiledImg, out var rects);

                using (var winpyr = new ImageWindow(tiledImg, "Tiled pyramid"))
                {
                    // This CNN detector represents a sliding window detector with 3 sliding windows. Each
                    // of the 3 windows has a different aspect ratio, allowing it to find vehicles which
                    // are either tall and skinny, squarish, or short and wide. The aspect ratio of a
                    // detection is determined by which channel in the output image triggers the detection.
                    // Here we are just going to max pool the channels together to get one final image for
                    // our display. In this image, a pixel will be bright if any of the sliding window
                    // detectors thinks there is a car at that location.
                    using (var subnet = net.GetSubnet())
                    {
                        var output = subnet.Output;
                        Console.WriteLine($"Number of channels in final tensor image: {output.K}");

                        var networkOutput = Dlib.ImagePlane(output);
                        for (var k = 1; k < output.K; k++)
                        {
                            using (var tmpNetworkOutput = Dlib.ImagePlane(output, 0, k))
                            {
                                var maxPointWise = Dlib.MaxPointWise(networkOutput, tmpNetworkOutput);
                                networkOutput.Dispose();
                                networkOutput = maxPointWise;
                            }
                        }

                        // We will also upsample the CNN's output image. The CNN we defined has an 8x
                        // downsampling layer at the beginning. In the code below we are going to overlay this
                        // CNN output image on top of the raw input image. To make that look nice it helps to
                        // upsample the CNN output image back to the same resolution as the input image, which
                        // we do here.
                        var networkOutputScale = img.Columns / (double)networkOutput.Columns;
                        Dlib.ResizeImage(networkOutput, networkOutputScale);

                        // Display the network's output as a color image.
                        using (var jet = Dlib.Jet(networkOutput, upper, lower))
                        using (var winOutput = new ImageWindow(jet, "Output tensor from the network"))
                        {
                            // Also, overlay network_output on top of the tiled image pyramid and display it.
                            for (var r = 0; r < tiledImg.Rows; ++r)
                            {
                                for (var c = 0; c < tiledImg.Columns; ++c)
                                {
                                    var tmp = new DPoint(c, r);
                                    tmp = Dlib.InputTensorToOutputTensor(net, tmp);
                                    var dp = networkOutputScale * tmp;
                                    tmp = new DPoint((int)dp.X, (int)dp.Y);
                                    if (Dlib.GetRect(networkOutput).Contains((int)tmp.X, (int)tmp.Y))
                                    {
                                        var val = networkOutput[(int)tmp.Y, (int)tmp.X];

                                        // Alpha blend the network output pixel with the RGB image to make our
                                        // overlay.
                                        var p = new RgbAlphaPixel();
                                        Dlib.AssignPixel(ref p, Dlib.ColormapJet(val, lower, upper));
                                        p.Alpha = 120;

                                        var rgb = new RgbPixel();
                                        Dlib.AssignPixel(ref rgb, p);
                                        tiledImg[r, c] = rgb;
                                    }
                                }
                            }

                            // If you look at this image you can see that the vehicles have bright red blobs on
                            // them. That's the CNN saying "there is a car here!". You will also notice there is
                            // a certain scale at which it finds cars. They have to be not too big or too small,
                            // which is why we have an image pyramid. The pyramid allows us to find cars of all
                            // scales.
                            using (var winPyrOverlay = new ImageWindow(tiledImg, "Detection scores on image pyramid"))
                            {
                                // Finally, we can collapse the pyramid back into the original image. The CNN doesn't
                                // actually do this step, since it's enough to threshold the tiled pyramid image to get
                                // the detections. However, it makes a nice visualization and clearly indicates that
                                // the detector is firing for all the cars.
                                using (var collapsed = new Matrix<float>(img.Rows, img.Columns))
                                using (var inputTensor = new ResizableTensor())
                                {
                                    inputLayer.ToTensor(img, 1, inputTensor);

                                    for (var r = 0; r < collapsed.Rows; ++r)
                                    {
                                        for (var c = 0; c < collapsed.Columns; ++c)
                                        {
                                            // Loop over a bunch of scale values and look up what part of network_output
                                            // corresponds to the point(c,r) in the original image, then take the max
                                            // detection score over all the scales and save it at pixel point(c,r).
                                            var maxScore = -1e30f;
                                            for (double scale = 1; scale > 0.2; scale *= 5.0 / 6.0)
                                            {
                                                // Map from input image coordinates to tiled pyramid coordinates.
                                                var tensorSpace = inputLayer.ImageSpaceToTensorSpace(inputTensor, scale, new DRectangle(new DPoint(c, r)));
                                                var tmp = tensorSpace.Center;

                                                // Now map from pyramid coordinates to network_output coordinates.
                                                var dp = networkOutputScale * Dlib.InputTensorToOutputTensor(net, tmp);
                                                tmp = new DPoint((int)dp.X, (int)dp.Y);
                                                if (Dlib.GetRect(networkOutput).Contains((int)tmp.X, (int)tmp.Y))
                                                {
                                                    var val = networkOutput[(int)tmp.Y, (int)tmp.X];
                                                    if (val > maxScore)
                                                        maxScore = val;
                                                }
                                            }

                                            collapsed[r, c] = maxScore;

                                            // Also blend the scores into the original input image so we can view it as
                                            // an overlay on the cars.
                                            var p = new RgbAlphaPixel();
                                            Dlib.AssignPixel(ref p, Dlib.ColormapJet(maxScore, lower, upper));
                                            p.Alpha = 120;

                                            var rgb = new RgbPixel();
                                            Dlib.AssignPixel(ref rgb, p);
                                            img[r, c] = rgb;
                                        }
                                    }

                                    using (var jet2 = Dlib.Jet(collapsed, upper, lower))
                                    using (var winCollapsed = new ImageWindow(jet2, "Collapsed output tensor from the network"))
                                    using (var winImgAndSal = new ImageWindow(img, "Collapsed detection scores on raw image"))
                                    {
                                        Console.WriteLine("Hit enter to end program");
                                        Console.ReadKey();
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
    }
}
public static Matrix<double> TestObjectDetectionFunction<T>(LossMmod detector,
                                                            IEnumerable<Matrix<T>> images,
                                                            IEnumerable<IEnumerable<MModRect>> truthDets,
                                                            TestBoxOverlap overlapTester = null,
                                                            double adjustThreshold = 0,
                                                            TestBoxOverlap overlapIgnoreTester = null)
    where T : struct
{
    if (detector == null)
        throw new ArgumentNullException(nameof(detector));
    if (images == null)
        throw new ArgumentNullException(nameof(images));
    if (truthDets == null)
        throw new ArgumentNullException(nameof(truthDets));

    detector.ThrowIfDisposed();
    images.ThrowIfDisposed();
    truthDets.ThrowIfDisposed();

    // Default overlap testers are created on demand and disposed on exit.
    var disposeOverlapTester = overlapTester == null;
    var disposeOverlapIgnoreTester = overlapIgnoreTester == null;

    try
    {
        if (disposeOverlapTester)
            overlapTester = new TestBoxOverlap();
        if (disposeOverlapIgnoreTester)
            overlapIgnoreTester = new TestBoxOverlap();

        using (var matrixVector = new StdVector<Matrix<T>>(images))
        using (var disposer = new EnumerableDisposer<StdVector<MModRect>>(truthDets.Select(r => new StdVector<MModRect>(r))))
        using (var detsVector = new StdVector<StdVector<MModRect>>(disposer.Collection))
        using (new EnumerableDisposer<StdVector<MModRect>>(detsVector))
        {
            var type = detector.NetworkType;
            Matrix<T>.TryParse<T>(out var elementTypes);
            var matrix = images.FirstOrDefault();
            var ret = NativeMethods.test_object_detection_function_net(type,
                                                                       detector.NativePtr,
                                                                       elementTypes.ToNativeMatrixElementType(),
                                                                       matrixVector.NativePtr,
                                                                       matrix.TemplateRows,
                                                                       matrix.TemplateColumns,
                                                                       detsVector.NativePtr,
                                                                       overlapTester.NativePtr,
                                                                       adjustThreshold,
                                                                       overlapIgnoreTester.NativePtr,
                                                                       out var result);
            switch (ret)
            {
                case NativeMethods.ErrorType.MatrixElementTypeNotSupport:
                    throw new ArgumentException($"{elementTypes} is not supported.");
                case NativeMethods.ErrorType.DnnNotSupportNetworkType:
                    throw new NotSupportNetworkTypeException(type);
            }

            // The native call fills a 1x3 matrix: precision, recall, and average precision.
            return new Matrix<double>(result, 1, 3);
        }
    }
    finally
    {
        if (disposeOverlapTester)
            overlapTester?.Dispose();
        if (disposeOverlapIgnoreTester)
            overlapIgnoreTester?.Dispose();
    }
}
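// Usage sketch for the wrapper above (not from the original source), assembled only from
// calls that appear elsewhere in these examples; the XML and model file names are placeholders.
public static void TestObjectDetectionFunctionUsageSketch()
{
    IList<Matrix<RgbPixel>> images;
    IList<IList<MModRect>> boxes;
    Dlib.LoadImageDataset("testing.xml", out images, out boxes);

    using (var net = LossMmod.Deserialize("mmod_rear_end_vehicle_detector.dat"))
    using (var metrics = Dlib.TestObjectDetectionFunction(net, images, boxes))
    {
        // The returned 1x3 matrix holds precision, recall, and average precision.
        Console.WriteLine($"testing results: {metrics}");
    }
}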
private static void Main(string[] args)
{
    try
    {
        // In this example we are going to train a face detector based on the
        // small faces dataset in the examples/faces directory. So the first
        // thing we do is load that dataset. This means you need to supply the
        // path to this faces folder as a command line argument so we will know
        // where it is.
        if (args.Length != 1)
        {
            Console.WriteLine("Give the path to the examples/faces directory as the argument to this");
            Console.WriteLine("program. For example, if you are in the examples folder then execute ");
            Console.WriteLine("this program by running: ");
            Console.WriteLine("   ./dnn_mmod_ex faces");
            return;
        }

        var facesDirectory = args[0];

        // The faces directory contains a training dataset and a separate
        // testing dataset. The training data consists of 4 images, each
        // annotated with rectangles that bound each human face. The idea is
        // to use this training data to learn to identify human faces in new
        // images.
        //
        // Once you have trained an object detector it is always important to
        // test it on data it wasn't trained on. Therefore, we will also load
        // a separate testing set of 5 images. Once we have a face detector
        // created from the training data we will see how well it works by
        // running it on the testing images.
        //
        // So here we create the variables that will hold our dataset.
        // images_train will hold the 4 training images and face_boxes_train
        // holds the locations of the faces in the training images. So for
        // example, the image images_train[0] has the faces given by the
        // rectangles in face_boxes_train[0].
        IList<Matrix<RgbPixel>> imagesTrain;
        IList<Matrix<RgbPixel>> imagesTest;
        IList<IList<MModRect>> faceBoxesTrain;
        IList<IList<MModRect>> faceBoxesTest;

        // Now we load the data. These XML files list the images in each dataset
        // and also contain the positions of the face boxes. Obviously you can use
        // any kind of input format you like so long as you store the data into
        // images_train and face_boxes_train. But for convenience dlib comes with
        // tools for creating and loading XML image datasets. Here you see how to
        // load the data. To create the XML files you can use the imglab tool which
        // can be found in the tools/imglab folder. It is a simple graphical tool
        // for labeling objects in images with boxes. To see how to use it read the
        // tools/imglab/README.txt file.
        Dlib.LoadImageDataset(facesDirectory + "/training.xml", out imagesTrain, out faceBoxesTrain);
        Dlib.LoadImageDataset(facesDirectory + "/testing.xml", out imagesTest, out faceBoxesTest);

        Console.WriteLine($"num training images: {imagesTrain.Count()}");
        Console.WriteLine($"num testing images: {imagesTest.Count()}");

        // The MMOD algorithm has some options you can set to control its behavior. However,
        // you can also call the constructor with your training annotations and a "target
        // object size" and it will automatically configure itself in a reasonable way for your
        // problem. Here we are saying that faces are still recognizably faces when they are
        // 40x40 pixels in size. You should generally pick the smallest size where this is
        // true. Based on this information the mmod_options constructor will automatically
        // pick a good sliding window width and height. It will also automatically set the
        // non-max-suppression parameters to something reasonable. For further details see the
        // mmod_options documentation.
        using (var options = new MModOptions(faceBoxesTrain, 40, 40))
        {
            // The detector will automatically decide to use multiple sliding windows if needed.
            // For the face data, only one is needed however.
            var detectorWindows = options.DetectorWindows.ToArray();
            Console.WriteLine($"num detector windows: {detectorWindows.Length}");
            foreach (var w in detectorWindows)
                Console.WriteLine($"detector window width by height: {w.Width} x {w.Height}");
            Console.WriteLine($"overlap NMS IOU thresh: {options.OverlapsNms.GetIouThresh()}");
            Console.WriteLine($"overlap NMS percent covered thresh: {options.OverlapsNms.GetPercentCoveredThresh()}");

            // Now we are ready to create our network and trainer.
            using (var net = new LossMmod(options, 2))
            {
                // The MMOD loss requires that the number of filters in the final network layer equal
                // options.detector_windows.size(). So we set that here as well.
                using (var subnet = net.GetSubnet())
                using (var details = subnet.GetLayerDetails())
                {
                    details.SetNumFilters(detectorWindows.Length);

                    using (var trainer = new DnnTrainer<LossMmod>(net))
                    {
                        trainer.SetLearningRate(0.1);
                        trainer.BeVerbose();
                        trainer.SetSynchronizationFile("mmod_sync", 5 * 60);
                        trainer.SetIterationsWithoutProgressThreshold(300);

                        // Now let's train the network. We are going to use mini-batches of 150
                        // images. The images are random crops from our training set (see
                        // random_cropper_ex.cpp for a discussion of the random_cropper).
                        IEnumerable<Matrix<RgbPixel>> miniBatchSamples;
                        //IEnumerable<IEnumerable<RgbPixel>> mini_batch_labels;
                        IEnumerable<IEnumerable<MModRect>> miniBatchLabels;

                        using (var cropper = new RandomCropper())
                        using (var chipDims = new ChipDims(200, 200))
                        {
                            cropper.ChipDims = chipDims;

                            // Usually you want to give the cropper whatever min sizes you passed to the
                            // mmod_options constructor, which is what we do here.
                            cropper.SetMinObjectSize(40, 40);

                            using (var rnd = new Rand())
                            {
                                // Run the trainer until the learning rate gets small. This will probably take several
                                // hours.
                                while (trainer.GetLearningRate() >= 1e-4)
                                {
                                    cropper.Operator(150, imagesTrain, faceBoxesTrain, out miniBatchSamples, out miniBatchLabels);

                                    // We can also randomly jitter the colors and that often helps a detector
                                    // generalize better to new images.
                                    foreach (var img in miniBatchSamples)
                                        Dlib.DisturbColors(img, rnd);

                                    LossMmod.TrainOneStep(trainer, miniBatchSamples, miniBatchLabels);

                                    miniBatchSamples.DisposeElement();
                                    miniBatchLabels.DisposeElement();
                                }

                                // Wait for training threads to stop.
                                trainer.GetNet();
                                Console.WriteLine("done training");

                                // Save the network to disk.
                                net.Clean();
                                LossMmod.Serialize(net, "mmod_network.dat");

                                // Now that we have a face detector we can test it. The first statement tests it
                                // on the training data. It will print the precision, recall, and then average precision.
                                // This statement should indicate that the network works perfectly on the
                                // training data.
                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTrain, faceBoxesTrain))
                                    Console.WriteLine($"training results: {matrix}");

                                // However, to get an idea if it really worked without overfitting we need to run
                                // it on images it wasn't trained on. The next line does this. Happily,
                                // this statement indicates that the detector finds most of the faces in the
                                // testing data.
                                using (var matrix = Dlib.TestObjectDetectionFunction(net, imagesTest, faceBoxesTest))
                                    Console.WriteLine($"testing results: {matrix}");

                                // If you are running many experiments, it's also useful to log the settings used
                                // during the training experiment. This statement will print the settings we used to
                                // the screen.
Console.WriteLine($"{trainer}{cropper}"); // Now lets run the detector on the testing images and look at the outputs. using (var win = new ImageWindow()) foreach (var img in imagesTest) { Dlib.PyramidUp(img); var dets = net.Operator(img); win.ClearOverlay(); win.SetImage(img); foreach (var d in dets[0]) { win.AddOverlay(d); } Console.ReadKey(); foreach (var det in dets) { foreach (var d in det) { d.Dispose(); } } } // Now that you finished this example, you should read dnn_mmod_train_find_cars_ex.cpp, // which is a more advanced example. It discusses many issues surrounding properly // setting the MMOD parameters and creating a good training dataset. } } } } } detectorWindows.DisposeElement(); } } catch (Exception e) { Console.WriteLine(e); } }