// Exports a classified text sample to a CSV feature matrix.
//
// Input (fpath): one sample per line, "<classKey><SEPARATOR><document>".
// Output (opath): one column per vocabulary term, then the
// "_class,_value,_training" trailer columns; one row per non-empty document.
//
// Throws IOException if opath already exists (FileMode.CreateNew, preserved
// from the original to avoid clobbering a previous export).
private void doExport(string fpath, string opath)
{
  var sample = new ClassifiedSample<string>();

  using (var srcFile = File.Open(fpath, FileMode.Open, FileAccess.Read))
  using (var srcReader = new StreamReader(srcFile))
  {
    // BUG FIX: the original read only the first line of the file, so the
    // exported sample contained at most one document. Read every line.
    string line;
    while ((line = srcReader.ReadLine()) != null)
    {
      if (string.IsNullOrWhiteSpace(line)) continue;

      var segs = line.Split(SEPARATOR, StringSplitOptions.RemoveEmptyEntries);
      var cls  = m_Classes[segs[0]];
      var doc  = segs[1];
      // NOTE(review): assumes documents are unique per file — duplicate docs
      // would make ClassifiedSample.Add throw if it is dictionary-backed; verify.
      sample.Add(doc, cls);
    }
  }

  var vocabulary = Alg.ExtractVocabulary(sample);
  var dim = vocabulary.Count;
  var builder = new StringBuilder();

  using (var outFile = File.Open(opath, FileMode.CreateNew, FileAccess.Write))
  using (var outWriter = new StreamWriter(outFile))
  {
    // header row: vocabulary terms, then the fixed trailer columns
    for (int i = 0; i < dim; i++)
    {
      builder.AppendFormat("{0},", vocabulary[i]);
    }
    builder.Append("_class,_value,_training");
    outWriter.WriteLine(builder.ToString());

    foreach (var pData in sample)
    {
      var doc = pData.Key;
      var cls = pData.Value;

      bool isEmpty;
      var data = Alg.ExtractFeatureVector(doc, out isEmpty);
      if (isEmpty) continue; // documents with no known terms are not exported

      builder.Clear();
      for (int i = 0; i < dim; i++)
      {
        builder.AppendFormat("{0},", data[i]);
      }
      // every exported row is flagged as training (constant 1), as in the original
      builder.AppendFormat("{0},{1},{2}", cls.Name, cls.Value, 1);
      outWriter.WriteLine(builder.ToString());
    }
  }
}
// Recursively builds an ID3 decision (sub)tree for the given sample.
//
// Base cases: a single-class sample yields a leaf; a split that fails to
// separate the sample (one side empty) yields a leaf with the majority class.
// Otherwise the most informative pattern becomes an inner node whose
// negative/positive children are trained on the corresponding partitions.
//
// Throws MLException when the sample is empty.
private DecisionNode<TObj> trainID3Core(IEnumerable<Predicate<TObj>> patterns, ClassifiedSample<TObj> sample, IInformativityIndex<TObj> informativity)
{
  if (!sample.Any())
    throw new MLException("Empty sample");

  var firstClass = sample.First().Value;
  if (sample.All(kvp => kvp.Value.Equals(firstClass)))
    return new LeafNode<TObj>(firstClass);

  // pick the most informative predicate and partition the sample by it
  var bestPattern = informativity.Max(patterns, sample);
  var positive = new ClassifiedSample<TObj>();
  var negative = new ClassifiedSample<TObj>();
  foreach (var pair in sample)
  {
    var bucket = bestPattern(pair.Key) ? positive : negative;
    bucket.Add(pair.Key, pair.Value);
  }

  // degenerate split: fall back to the majority class of the whole sample
  if (!negative.Any() || !positive.Any())
  {
    var majority = sample.GroupBy(p => p.Value)
                         .OrderByDescending(g => g.Count())
                         .First()
                         .Key;
    return new LeafNode<TObj>(majority);
  }

  var node = new InnerNode<TObj>(bestPattern);
  node.SetNegativeNode(trainID3Core(patterns, negative, informativity));
  node.SetPositiveNode(trainID3Core(patterns, positive, informativity));

  return node;
}
// Loads an MNIST idx image/label file pair into the sample.
//
// ipath: idx3 image file (magic 2051); lpath: idx1 label file (magic 2049).
// Pixels are stored as shade/255 in [0,1]; the shade is intentionally NOT
// inverted (255-*) so the logical format stays 0=white, 255=black.
private void loadSample(string ipath, string lpath, ClassifiedSample<double[][,]> sample)
{
  using (var imageStream = File.Open(ipath, FileMode.Open, FileAccess.Read))
  using (var labelStream = File.Open(lpath, FileMode.Open, FileAccess.Read))
  {
    if (ReadInt32BigEndian(imageStream) != 2051)
      throw new Exception("Incorrect MNIST image datafile");
    if (ReadInt32BigEndian(labelStream) != 2049)
      throw new Exception("Incorrect MNIST label datafile");

    var count = ReadInt32BigEndian(imageStream);
    var rows  = ReadInt32BigEndian(imageStream);
    var cols  = ReadInt32BigEndian(imageStream);
    // label count is consumed but discarded — assumed equal to image count; TODO confirm
    ReadInt32BigEndian(labelStream);

    for (int q = 0; q < count; q++)
    {
      var image = new double[1][,] { new double[rows, cols] };

      for (int r = 0; r < rows; r++)
      {
        for (int c = 0; c < cols; c++)
        {
          // keep logical format: 0=white, 255=black — not image color format
          image[0][r, c] = imageStream.ReadByte() / 255.0D;
        }
      }

      var label = labelStream.ReadByte();
      sample.Add(image, m_Classes[label]);
    }

    Console.WriteLine("Loaded: {0}", ipath);
    Console.WriteLine("Loaded: {0}", lpath);
  }
}
// Loads 3x32x32 RGB records from binary files into the sample.
//
// Record layout (presumably CIFAR-style — TODO confirm): 1 label byte followed
// by 3*32*32 channel-major pixel bytes. Records whose label is not present in
// m_Classes are skipped by seeking past their pixel payload.
private void loadSample(string[] fpaths, ClassifiedSample<double[][,]> sample)
{
  foreach (var path in fpaths)
  {
    using (var stream = File.Open(path, FileMode.Open, FileAccess.Read))
    {
      int label;
      while ((label = stream.ReadByte()) >= 0) // ReadByte returns -1 at EOF
      {
        Class cls;
        if (!m_Classes.TryGetValue(label, out cls))
        {
          // unwanted class: skip this record's raw RGB payload
          stream.Seek(3 * 32 * 32, SeekOrigin.Current);
          continue;
        }

        var channels = new double[3][,];
        for (int c = 0; c < 3; c++)
        {
          channels[c] = new double[32, 32];
          for (int y = 0; y < 32; y++)
          {
            for (int x = 0; x < 32; x++)
            {
              channels[c][y, x] = stream.ReadByte() / 255.0D; // normalize to [0,1]
            }
          }
        }

        sample.Add(channels, cls);
      }
    }
  }
}
// Loads a training set: image files from the directory at `path`, with class
// names taken column 1 of the CSV label file at `lpath` (first line = header).
// Clears any previously loaded sample first.
private void loadTrain(string path, string lpath, ClassifiedSample<double[][,]> sample)
{
  sample.Clear();

  using (var labelStream = File.Open(lpath, FileMode.Open, FileAccess.Read))
  using (var labelReader = new StreamReader(labelStream))
  {
    labelReader.ReadLine(); // skip the label file's CSV header

    // NOTE(review): pairs the i-th enumerated file with the i-th label row —
    // assumes EnumerateFiles order matches label-file order; verify. Also
    // assumes the label file has at least as many rows as the directory has
    // files (ReadLine() would return null otherwise).
    foreach (var file in new DirectoryInfo(path).EnumerateFiles())
    {
      var data    = loadFile(file.FullName);
      var clsName = labelReader.ReadLine().Split(',')[1];
      var cls     = m_Classes.First(c => c.Value.Name.Equals(clsName)).Value;
      sample.Add(data, cls);
    }
  }

  Console.WriteLine("Loaded files from: {0}", path);
}
// Loads image samples from a CSV file: first line is a header, then one row
// per image — "label,pix1,pix2,...,pixN" with N = IMG_SIZE*IMG_SIZE, pixels
// stored row-major. Clears any previously loaded sample first.
private void loadSample(string ipath, ClassifiedSample<double[][,]> sample)
{
  sample.Clear();

  using (var stream = File.Open(ipath, FileMode.Open, FileAccess.Read))
  using (var reader = new StreamReader(stream))
  {
    reader.ReadLine(); // discard the CSV header

    while (true)
    {
      var row = reader.ReadLine();
      if (string.IsNullOrWhiteSpace(row)) break; // EOF or blank terminator

      var values = row.Split(',').Select(int.Parse).ToArray();
      var cls    = m_Classes[values[0]];
      var image  = new double[1][,] { new double[IMG_SIZE, IMG_SIZE] };

      for (int idx = 1; idx <= IMG_SIZE * IMG_SIZE; idx++)
      {
        // do not invert 255-*: keep logical format 0=white, 255=black —
        // not image color format
        image[0][(idx - 1) / IMG_SIZE, (idx - 1) % IMG_SIZE] = values[idx] / 255.0D;
      }

      sample.Add(image, cls);
    }

    Console.WriteLine("Loaded: {0}", ipath);
  }
}
// Loads 3x32x32 RGB records from binary files into the sample.
//
// Record layout (presumably CIFAR-style — TODO confirm): 1 label byte followed
// by 3*32*32 channel-major pixel bytes. Unlike the filtering variant of this
// loader, an unknown label makes the m_Classes indexer throw.
private void loadSample(string[] fpaths, ClassifiedSample<double[][,]> sample)
{
  foreach (var path in fpaths)
  {
    using (var stream = File.Open(path, FileMode.Open, FileAccess.Read))
    {
      int label;
      while ((label = stream.ReadByte()) >= 0) // ReadByte returns -1 at EOF
      {
        var cls = m_Classes[label];

        var channels = new double[3][,];
        for (int c = 0; c < 3; c++)
        {
          channels[c] = new double[32, 32];
          for (int y = 0; y < 32; y++)
          {
            for (int x = 0; x < 32; x++)
            {
              channels[c][y, x] = stream.ReadByte() / 255.0D; // normalize to [0,1]
            }
          }
        }

        sample.Add(channels, cls);
      }
    }
  }
}
// Reads the CSV body of a dataset: one sample per line.
//
// featureIndxs  - column indices of the numeric features;
// trainingIndx  - column index of the training flag (negative = absent);
// classesIndx   - column index of the class name;
// clsValIdx     - column index of the optional class value (negative = absent).
//
// Lines starting with "//" are comments; a blank line (or EOF) terminates the
// body. Lines with unparsable feature values are recorded in Errors and
// skipped. Unknown class names are registered in Classes on first sight.
private void readBody(StreamReader reader, int[] featureIndxs, int trainingIndx, int classesIndx, int clsValIdx)
{
  var dim = featureIndxs.Length;
  var lineNum = 0;

  while (true)
  {
    lineNum++;
    var line = reader.ReadLine();
    if (line != null && line.StartsWith("//")) continue; // comment line
    if (string.IsNullOrWhiteSpace(line)) break;          // EOF or terminator

    var data = line.Split(',');
    var success = true;
    var point = new double[dim];

    for (var i = 0; i < dim; i++)
    {
      double result;
      var ftIdx = featureIndxs[i];
      // BUG FIX: parse with the invariant culture so '.'-decimal CSV data
      // loads correctly regardless of the machine's locale (CA1305); the
      // original used the current culture and silently rejected valid rows
      // on comma-decimal locales.
      if (!double.TryParse(data[ftIdx],
                           System.Globalization.NumberStyles.Float,
                           System.Globalization.CultureInfo.InvariantCulture,
                           out result))
      {
        success = false;
        break;
      }
      point[i] = result;
    }

    if (!success)
    {
      Errors.Add(new DataError { LineNum = lineNum, Line = line });
      continue;
    }

    Class cls;
    var clsName = data[classesIndx];
    if (!Classes.TryGetValue(clsName, out cls))
    {
      // first occurrence of this class: register it, with an optional value
      double val;
      var value = (clsValIdx < 0 ||
                   !double.TryParse(data[clsValIdx],
                                    System.Globalization.NumberStyles.Float,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out val))
                  ? (double?)null : val;
      cls = new Class(clsName, value);
      Classes[clsName] = cls;
    }

    Data.Add(point, cls);

    if (trainingIndx >= 0)
    {
      // non-zero flag marks the row as part of the training sample
      var isTraining = int.Parse(data[trainingIndx],
                                 System.Globalization.CultureInfo.InvariantCulture) != 0;
      if (isTraining)
      {
        TrainingSample.Add(point, cls);
      }
    }
  }
}