/// <summary>
/// Reads the MNIST training label and image byte files, validates their headers, and then
/// optionally saves every image to disk and/or re-writes the data into new byte files.
/// </summary>
/// <param name="lablePath">Path of the label byte file (header: magic 2049, count 60000).</param>
/// <param name="imagePath">Path of the image byte file (header: magic 2051, count 60000, 28 rows, 28 columns).</param>
/// <param name="dest">Root directory; images are saved into a sub-directory named after their label.</param>
/// <param name="resaveByteFiles">When true, writes the loaded data to the files "mnistlabel" and "mnistimage" (relative paths).</param>
/// <param name="saveImages">When true, saves each loaded image under <paramref name="dest"/>.</param>
/// <param name="orderbylabel">When true (and <paramref name="resaveByteFiles"/> is true), sorts the data by numeric label before writing.</param>
/// <exception cref="ArgumentException">The label or image file header does not match the expected values.</exception>
public static void UnpackByteFileToImages(string lablePath, string imagePath, string dest, bool resaveByteFiles, bool saveImages, bool orderbylabel)
{
    // Expected header values of the training set files checked below.
    const int expectedImageCount = 60000;
    const int imageSide = 28;
    const int labelHeaderBytes = 8; // two 32-bit header fields: magic + item count
    const int labelMagic = 2049;
    const int imageMagic = 2051;

    // Create one directory per entry of StringResources.Digits under the destination root.
    DirGenerator.CreateDirs(StringResources.Digits.Select(x => x.ToString()).ToList(), dest);

    var imageDatas = new List<ImageData>(expectedImageCount);

    // 'using' guarantees the streams are released even when header validation throws.
    using (var labelStream = new FileStream(lablePath, FileMode.Open, FileAccess.Read))
    using (var labelReader = new BinaryReader(labelStream))
    using (var imageStream = new FileStream(imagePath, FileMode.Open, FileAccess.Read))
    using (var imageReader = new BinaryReader(imageStream))
    {
        // Verify the label file: total length, magic number and item count.
        var labelFileLength = labelStream.Length;
        var lmagic = labelReader.ReadInt32BigEndian();
        var lnum = labelReader.ReadInt32BigEndian();
        if (labelFileLength != expectedImageCount + labelHeaderBytes || lmagic != labelMagic || lnum != expectedImageCount)
        {
            throw new ArgumentException("label byte file corrupted");
        }

        // Verify the image file: magic number, item count and image dimensions.
        int magic = imageReader.ReadInt32BigEndian();
        int numImages = imageReader.ReadInt32BigEndian();
        int numRows = imageReader.ReadInt32BigEndian();
        int numCols = imageReader.ReadInt32BigEndian();
        if (magic != imageMagic || numImages != expectedImageCount || numRows != imageSide || numCols != imageSide)
        {
            throw new ArgumentException("image byte file corrupted");
        }

        // Read exactly the number of records announced by the header instead of looping
        // until an EndOfStreamException signals the end of the file.
        try
        {
            for (var i = 0; i < numImages; i++)
            {
                var imageData = imageReader.ReadAsImage(imageSide, imageSide);
                int label = labelReader.ReadByte();
                imageData.Label = label.ToString();
                imageDatas.Add(imageData);
            }
        }
        catch (EndOfStreamException)
        {
            // Best effort: keep what was read so far, but make the truncation visible.
            Console.WriteLine($"image byte file ended early after {imageDatas.Count} of {numImages} images");
        }
        catch (Exception e)
        {
            // Best effort (as before): report the problem and continue with the images read so far.
            Console.WriteLine(e);
        }
    }

    if (saveImages)
    {
        foreach (var imageData in imageDatas)
        {
            var labelDir = Path.Combine(dest, imageData.Label);
            // A fresh GUID per file keeps names unique within a label directory.
            var imgPath = Path.Combine(labelDir, $"{Guid.NewGuid()}.jpg");
            // NOTE(review): Save is called without an explicit image format; confirm that the
            // written files really are JPEG as the extension suggests.
            imageData.bitmap.Save(imgPath);
        }
    }

    if (resaveByteFiles)
    {
        if (orderbylabel)
        {
            imageDatas = imageDatas.OrderBy(x => Convert.ToInt32(x.Label)).ToList();
        }

        // Dispose the writers so the output is flushed and the file handles are released.
        using (var labelWriter = new BinaryWriter(new FileStream("mnistlabel", FileMode.Create)))
        using (var imageDataWriter = new BinaryWriter(new FileStream("mnistimage", FileMode.Create)))
        {
            ByteFileGenerator.WriteDataToFile(imageDatas, labelWriter, imageDataWriter, expectedImageCount, imageSide);
        }
    }
}
// Normally command 2 is run first and then command 1. Once the byte files for images and
// labels have been generated, move those files to the ML solution.
/// <summary>
/// Console entry point: prints the three available commands, reads one command from
/// standard input and runs it. Unknown or missing input does nothing.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // For each character we have around 10000 images, either synthesized ones or dummy copies.
    // Our ML solution does not randomize the order of the images, therefore it is vital that
    // we shuffle them already in this project.
    Console.WriteLine("generate a byte file containing all image byte data by pointing to an image source - 1");

    // We are not sure if synthesizing works, but it is supported. If it is not used, we make
    // copies of the original images to gain enough training examples. The source images are
    // organized by font name, not by character identity; they are resized to 28*28 and
    // reorganized into folders by character identity, and then each image is either
    // synthesized or copied (when using Windows fonts for testing: 14 fonts, 1000 copies
    // are enough).
    Console.WriteLine("synthesize images / make copies of original ones - 2");

    // The ML pipeline takes the image and label byte files generated by us as input. To make
    // sure our own byte files are in the correct format, we unpack the original
    // "..\\..\\Data\\train-labels-idx1-ubyte" and "..\\..\\Data\\train-images-idx3-ubyte"
    // files into normal image files. Viewing the actual images lets us be sure about the
    // processing that generates the byte files for images/labels.
    Console.WriteLine("unpack mnist byte files into images - 3");

    // Constant-first comparison so a null _twoLetters selects the single-letter config
    // instead of throwing.
    LabelConfig labelConfig;
    if ("1".Equals(_twoLetters))
    {
        labelConfig = new TwoLetterConfig();
    }
    else
    {
        var allSingleChars = ConfigurationManager.AppSettings["GeneratedChars"];
        labelConfig = new SingleLetterConfig(allSingleChars);
    }

    // Console.ReadLine returns null at end of input; switching on the value handles that
    // without a NullReferenceException.
    var command = Console.ReadLine();
    switch (command)
    {
        case "1":
        {
            ByteFileGenerator.GenerateByteFile(_fontDataDirDest, size, labelConfig);
            break;
        }

        case "2":
        {
            Console.WriteLine("we are going to normalize the data and put them into folder with char name, do you want to spawn 5 synthesized images per image? (false/true)");
            var shouldSynthesize = Convert.ToBoolean(Console.ReadLine());

            Console.WriteLine("generate more copies of original image? type of number, if 1 no copy is saved, if larger than 1 then N-1 more copies will be saved");
            var copy = Convert.ToInt32(Console.ReadLine());

            // The images fed into the ML solution have a black background.
            Console.WriteLine("should make background black? (true/false)");
            var colorInvert = Convert.ToBoolean(Console.ReadLine());

            // Create a directory for each character / two-letter label.
            labelConfig.LabelDatas.ForEach(ld => DirGenerator.CreateDir(ld.Label, _fontDataDirDest));

            Directory.CreateDirectory(_fontDataDir);
            Directory.CreateDirectory(_fontDataDirDest);

            ImageGenerator.GenerateImages(_fontDataDir, _fontDataDirDest, shouldSynthesize, synthesizeCount, copy, colorInvert, size);
            break;
        }

        case "3":
        {
            Console.WriteLine("read byte data from MNIST files, then output new byte files?");
            bool resaveByteFile = Convert.ToBoolean(Console.ReadLine());

            Console.WriteLine("save images to hard disk?");
            bool saveImages = Convert.ToBoolean(Console.ReadLine());

            // Ordering only matters when new byte files are written, so only ask in that case.
            bool orderbylabel = false;
            if (resaveByteFile)
            {
                Console.WriteLine("order by label?");
                orderbylabel = Convert.ToBoolean(Console.ReadLine());
            }

            MNISTUnpack.UnpackByteFileToImages("..\\..\\Data\\train-labels-idx1-ubyte", "..\\..\\Data\\train-images-idx3-ubyte", _mnistDir, resaveByteFile, saveImages, orderbylabel);
            break;
        }

        default:
            // Unknown command or end of input: nothing to do.
            break;
    }
}