/// <summary>
/// Recursively visits every leaf below <paramref name="tree"/> and renders the path from that
/// leaf up to the root as a textual rule ("&lt;attribute&gt; equals to &lt;value&gt; and ...").
/// Rules of leaves whose <c>Class</c> is 0 are appended to <paramref name="sb"/>; for both classes
/// the rule with the largest instance count seen so far is tracked through the <c>ref</c> parameters.
/// </summary>
/// <param name="sb">Receives the text of every class-0 rule.</param>
/// <param name="sbMaxPositive">Replaced by the class-0 rule with the highest count (with a trailing "COUNT" line).</param>
/// <param name="sbMaxNegative">Replaced by the non-class-0 rule with the highest count (with a trailing "COUNT" line).</param>
/// <param name="maxPositive">Highest count seen so far among class-0 leaves.</param>
/// <param name="maxNegative">Highest count seen so far among the other leaves.</param>
/// <param name="tree">Node to start from; inner nodes are descended into, leaves are printed.</param>
/// <param name="header">ARFF header used to translate attribute and value indices into names.</param>
private static void PrintTreeAsRules(StringBuilder sb, ref StringBuilder sbMaxPositive, ref StringBuilder sbMaxNegative, ref int maxPositive, ref int maxNegative, Id3Node tree, ArffHeader header)
{
    if (!tree.IsLeaf)
    {
        foreach (KeyValuePair<int, Id3Node> kvp in tree.Children)
        {
            PrintTreeAsRules(sb, ref sbMaxPositive, ref sbMaxNegative, ref maxPositive, ref maxNegative, kvp.Value, header);
        }

        return;
    }

    // A leaf without a parent (the whole tree is one node) has no conditions and no
    // parent-side counts to report; previously this case crashed on tree.Parent.
    if (tree.Parent == null)
    {
        return;
    }

    Id3Node leaf = tree;

    // Number of instances recorded by the parent for the branch that leads to this leaf.
    int count = leaf.Parent.ValueClassCounts[leaf.ParentValue].Values.Sum();

    StringBuilder localSB = new StringBuilder();
    localSB.AppendLine("Rule is:");
    localSB.AppendLine();

    // Walk from the leaf up to the root, emitting one condition per edge. The separator is
    // written *between* conditions so the rule no longer ends with a dangling " and ".
    bool isFirstCondition = true;
    for (Id3Node node = leaf; node.Parent != null; node = node.Parent)
    {
        var attribute = header.Attributes.ElementAt(node.Parent.AttributeIndex);

        // A parent value of -1 is rendered as "?".
        string value = node.ParentValue == -1
            ? "?"
            : ((ArffNominalAttribute)attribute.Type).Values[node.ParentValue];

        if (!isFirstCondition)
        {
            localSB.Append(" and ");
        }

        localSB.Append($"<{attribute.Name}> equals to <{value}>");
        isFirstCondition = false;
    }

    localSB.AppendLine();
    localSB.AppendLine("---------------------------------------");

    if (leaf.Class == 0)
    {
        // Snapshot the rule text before the COUNT line is added to the builder below.
        sb.Append(localSB.ToString());

        if (count > maxPositive)
        {
            maxPositive = count;
            sbMaxPositive = localSB;
            sbMaxPositive.AppendLine($"COUNT: {count}");
        }
    }
    else
    {
        if (count > maxNegative)
        {
            maxNegative = count;
            sbMaxNegative = localSB;
            sbMaxNegative.AppendLine($"COUNT: {count}");
        }
    }
}
/// <summary>
/// Merges the given ARFF files into a single "&lt;textBox3.Text&gt;.arff" file that contains only the
/// attributes whose names are listed (one per line) in "attributes.txt".
/// The output header (relation name and attribute declarations) is written once, from the first
/// input file in which at least one listed attribute is found. Each input file then contributes
/// one data row built from the last instance it contains.
/// </summary>
/// <param name="allfiles">Paths of the ARFF files to merge.</param>
private void WriteArffFile(string[] allfiles)
{
    using (ArffWriter arffWriter = new ArffWriter(textBox3.Text + ".arff"))
    {
        // Loaded on first use so that nothing is read when there is no file to process.
        string[] wantedNames = null;
        bool headerWritten = false;

        // One slot per input file; a slot stays null when the file yields no usable row.
        object[][] rows = new object[allfiles.Length][];
        int fileIndex = 0;

        foreach (string file in allfiles)
        {
            using (ArffReader datafile = new ArffReader(file))
            {
                ArffHeader header = datafile.ReadHeader();

                // Keep only the last instance of the file.
                // NOTE(review): presumably every input file holds a single instance - confirm.
                object[] instance = null;
                foreach (var ins in datafile.ReadAllInstances())
                {
                    instance = ins;
                }

                if (header != null)
                {
                    if (wantedNames == null)
                    {
                        wantedNames = File.ReadAllLines("attributes.txt");
                    }

                    var attributes = header.Attributes;

                    // Pass 1: how many (listed name, attribute) matches does this file have?
                    int selectedCount = 0;
                    foreach (string name in wantedNames)
                    {
                        foreach (var attribute in attributes)
                        {
                            if (attribute.Name == name)
                            {
                                selectedCount++;
                            }
                        }
                    }

                    bool writeHeader = !headerWritten && selectedCount > 0;
                    if (writeHeader)
                    {
                        arffWriter.WriteRelationName(header.RelationName);
                    }

                    // Pass 2: emit the declarations (first file only) and pick the values, in the
                    // order given by attributes.txt. The attribute index restarts for every listed
                    // name so it always addresses the matching column of the instance.
                    object[] selected = new object[selectedCount];
                    int column = 0;
                    foreach (string name in wantedNames)
                    {
                        int attributeIndex = 0;
                        foreach (var attribute in attributes)
                        {
                            if (attribute.Name == name)
                            {
                                if (writeHeader)
                                {
                                    arffWriter.WriteAttribute(new ArffAttribute(attribute.Name, attribute.Type));
                                }

                                if (instance != null)
                                {
                                    selected[column] = instance[attributeIndex];
                                }

                                column++;
                            }

                            attributeIndex++;
                        }
                    }

                    if (writeHeader)
                    {
                        headerWritten = true;
                    }

                    if (instance != null && selectedCount > 0)
                    {
                        rows[fileIndex] = selected;
                    }
                }
            }

            fileIndex++;
            progress++;
        }

        // Data rows go last, after the complete header; each row is written as-is
        // (not wrapped in another array).
        foreach (object[] row in rows)
        {
            if (row != null)
            {
                arffWriter.WriteInstance(row);
            }
        }
    }
}
/// <summary>
/// Loads the training and test ARFF files, then repeats a sampling experiment 100 times:
/// for several ensemble sizes it trains one ID3 classifier per sample and prints the
/// resulting training and test accuracies as CSV lines.
/// </summary>
static void Main(string[] args)
{
    ArffHeader header = null;

    // ---- Training set ----
    List<object[]> rawTraining = new List<object[]>();
    using (ArffReader arffReader = new ArffReader(TrainingArffFile))
    {
        header = arffReader.ReadHeader();
        for (object[] row = arffReader.ReadInstance(); row != null; row = arffReader.ReadInstance())
        {
            rawTraining.Add(row);
        }
    }

    // Every value becomes an int; a null value is encoded as -1.
    List<int[]> trainingData = rawTraining
        .Select(row => row.Select(cell => cell == null ? -1 : (int)cell).ToArray())
        .ToList();

    // ---- Test set ----
    List<object[]> rawTest = new List<object[]>();
    using (ArffReader arffReader = new ArffReader(TestArffFile))
    {
        header = arffReader.ReadHeader();
        for (object[] row = arffReader.ReadInstance(); row != null; row = arffReader.ReadInstance())
        {
            rawTest.Add(row);
        }
    }

    List<int[]> testData = rawTest
        .Select(row => row.Select(cell => cell == null ? -1 : (int)cell).ToArray())
        .ToList();

    Console.WriteLine("Number of Samples, training accuracy, test accuracy");

    // Ensemble sizes to evaluate; both result maps are seeded from this single list.
    int[] sampleSizes = { 1, 3, 5, 10, 20, 25, 50, 75, 100 };

    // Repeat the exercise multiple times because the sampling is random.
    Parallel.For(0, 100, (k) =>
    {
        ConcurrentDictionary<int, double> trainingAccuracyBySize =
            new ConcurrentDictionary<int, double>(sampleSizes.ToDictionary(size => size, size => 0.0));
        ConcurrentDictionary<int, double> testAccuracyBySize =
            new ConcurrentDictionary<int, double>(sampleSizes.ToDictionary(size => size, size => 0.0));

        // The different ensemble sizes are evaluated in parallel.
        Parallel.ForEach(trainingAccuracyBySize.Keys, sampleCount =>
        {
            Sampler sampler = new Sampler(trainingData, sampleCount);

            // One classifier per sample.
            List<Id3Classifier> ensemble = Enumerable
                .Range(0, sampleCount)
                .Select(index => new Id3Classifier(sampler.Samples[index], ClassIndex, Confidence))
                .ToList();

            // Score on both sets so that overfitting becomes visible.
            trainingAccuracyBySize[sampleCount] = Evaluate(trainingData, ensemble);
            testAccuracyBySize[sampleCount] = Evaluate(testData, ensemble);
        });

        // Print the results of one repetition as a single uninterrupted group of lines.
        lock (_lockConsole)
        {
            foreach (int sampleCount in testAccuracyBySize.Keys.OrderBy(size => size))
            {
                Console.WriteLine($"{sampleCount},{trainingAccuracyBySize[sampleCount]},{testAccuracyBySize[sampleCount]}");
            }
        }
    });

    Console.WriteLine("Press ENTER to exit...");
    Console.ReadLine();
}
/// <summary>
/// Loads the training and test ARFF files, builds one ID3 tree per confidence level and prints
/// node count plus training/test accuracy for each. For confidence levels above 0.5 the tree is
/// additionally written to "Tree&lt;confidence&gt;.txt" in the output folder as a list of rules.
/// </summary>
/// <exception cref="InvalidOperationException">The training file contains no instances.</exception>
static void Main(string[] args)
{
    // Training
    ArffHeader header;
    List<object[]> instances = new List<object[]>();
    using (ArffReader arffReader = new ArffReader(_arffFile))
    {
        header = arffReader.ReadHeader();
        object[] instance;
        while ((instance = arffReader.ReadInstance()) != null)
        {
            instances.Add(instance);
        }
    }

    // Every value becomes an int; a null value is encoded as -1.
    List<int[]> trainingData = new List<int[]>(instances.Select(objectArray => objectArray.Select(o => o == null ? -1 : (int)o).ToArray()));

    // Test
    instances = new List<object[]>();
    using (ArffReader arffReader = new ArffReader(_testArffFile))
    {
        // The test header is still read before the instances, as before, but it no longer
        // replaces the training header: the tree is built from the training data, so its
        // attribute and value indices have to be labelled with the training file's header.
        arffReader.ReadHeader();
        object[] instance;
        while ((instance = arffReader.ReadInstance()) != null)
        {
            instances.Add(instance);
        }
    }

    List<int[]> testData = new List<int[]>(instances.Select(objectArray => objectArray.Select(o => o == null ? -1 : (int)o).ToArray()));

    if (trainingData.Count == 0)
    {
        // Without this check the first trainingData[0] access fails with an opaque index error.
        throw new InvalidOperationException("The training ARFF file contains no instances.");
    }

    // The class label is the last value of every instance.
    int trainingClassIndex = trainingData[0].Length - 1;
    int testClassIndex = testData.Count > 0 ? testData[0].Length - 1 : trainingClassIndex;

    double[] confidences = new double[] { 0.0, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.9999 };

    // NOTE(review): hard-coded, user-specific output path - consider making it configurable.
    PrintAsCsv(header, trainingData, @"c:\users\andresz\desktop\data.csv");

    Parallel.ForEach(confidences, confidence =>
    {
        Id3Node tree = Id3Node.BuildTree(trainingData, trainingClassIndex, confidence);
        Console.WriteLine($"Confidence {confidence}: Num of nodes {GetCount(tree)}");

        // Accuracy on the training set.
        double trainingAccuracy = trainingData.Count(instance => GetClass(instance, tree) == instance[trainingClassIndex]) / (double)trainingData.Count;
        Console.WriteLine($"Confidence {confidence}: Accuracy on train = {trainingAccuracy}");

        // Accuracy on the test set (NaN when the test set is empty, as before).
        double testAccuracy = testData.Count(instance => GetClass(instance, tree) == instance[testClassIndex]) / (double)testData.Count;
        Console.WriteLine($"Confidence {confidence}: Accuracy on test = {testAccuracy}");

        // Only print small trees.
        if (confidence > 0.5)
        {
            StringBuilder sb = new StringBuilder();
            StringBuilder sbMaxPositive = new StringBuilder();
            StringBuilder sbMaxNegative = new StringBuilder();
            int maxPositive = int.MinValue;
            int maxNegative = int.MinValue;

            PrintTreeAsRules(sb, ref sbMaxPositive, ref sbMaxNegative, ref maxPositive, ref maxNegative, tree, header);

            sb.AppendLine("The most max positive rule is:");
            sb.AppendLine(sbMaxPositive.ToString());
            sb.AppendLine();
            sb.AppendLine("The most max negative rule is:");
            sb.AppendLine(sbMaxNegative.ToString());

            Directory.CreateDirectory(_outputFolder);
            File.WriteAllText(Path.Combine(_outputFolder, $"Tree{confidence}.txt"), sb.ToString());
        }
    });
}
/// <summary>
/// Reads the ARFF file at <c>filePath</c>, records per-attribute metadata, converts all instances
/// to numeric rows and, when the C4.5 algorithm is selected, induces a decision tree from them.
/// Finally derives the rule set and the predictions for the training inputs from the tree.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The decision variable type is Discrete but an attribute's type text starts with "numeric" or
/// "real", or contains no comma.
/// </exception>
private void Learn()
{
    using (ArffReader arffReader = new ArffReader(filePath))
    {
        header = arffReader.ReadHeader();
        attributes = new Dictionary<string, AttributeListInfo>(); // attribute name -> attribute info
        attributeList = header.Attributes.ToList();

        foreach (var attr in attributeList)
        {
            // The textual form of the type is split on commas; a single piece is treated
            // the same way as an explicit "numeric"/"real" type.
            string[] typeParts = attr.Type.ToString().Split(',');
            string firstPart = typeParts[0];
            bool treatedAsContinuous = firstPart == "numeric" || firstPart == "real" || typeParts.Length == 1;

            if (decisionVariableType == AttributeType.Discrete && treatedAsContinuous)
            {
                throw new InvalidOperationException("Variable Type Should be Continous, not Discrete");
            }

            // Strip the surrounding braces and whitespace from every piece.
            List<string> cleanedParts = new List<string>(typeParts.Length);
            for (int p = 0; p < typeParts.Length; p++)
            {
                cleanedParts.Add(typeParts[p].Replace("{", "").Replace("}", "").Trim());
            }

            // Describe the attribute and register it under its name.
            AttributeListInfo info = new AttributeListInfo();
            info.NumberOfInputs = cleanedParts.Count;
            info.Inputs = cleanedParts;
            attributes.Add(attr.Name, info);

            // While fewer than (attribute count - 1) entries have been collected the attribute
            // is registered as an input; otherwise only its name is remembered.
            bool stillCollectingInputs = _attributes.Count < attributeList.Count - 1;
            if (stillCollectingInputs)
            {
                _attributes.Add(new Attribute(attr.Name, info.NumberOfInputs));
                queryList.Add(new Query(attr.Name, cleanedParts));
            }
            else
            {
                lastAttrName = attr.Name;
            }
        }

        List<double[]> featureRows = new List<double[]>();
        List<double[]> fullRows = new List<double[]>();
        List<int> labels = new List<int>();

        for (object[] instance = arffReader.ReadInstance(); instance != null; instance = arffReader.ReadInstance())
        {
            int width = instance.Length;
            double[] featureRow = new double[width - 1];
            double[] fullRow = new double[width];

            // Parse every value once; a value that cannot be parsed is stored as 0.
            for (int i = 0; i < width; i++)
            {
                Double.TryParse(instance[i] + "", out fullRow[i]);
            }

            // The feature row is the full row without its last column.
            Array.Copy(fullRow, featureRow, width - 1);

            featureRows.Add(featureRow);
            fullRows.Add(fullRow);
            labels.Add(Int32.Parse(instance[width - 1] + ""));
        }

        dInputs = featureRows.ToArray();  // inputs (all columns except the last)
        allDInputs = fullRows.ToArray();  // all columns as doubles
        outputs = labels.ToArray();

        // Decide what algorithm to use based on the options.
        if (learningAlgorithm == LearningAlgorithm.C45Learning)
        {
            bool continuousOrBoth = decisionVariableType == AttributeType.Continuous
                || decisionVariableType == AttributeType.Both;

            if (continuousOrBoth || decisionVariableType == AttributeType.Discrete)
            {
                // Only the discrete case passes the collected attribute descriptions.
                if (continuousOrBoth)
                {
                    c45 = new C45Algorithm();
                }
                else
                {
                    c45 = new C45Algorithm(_attributes.ToArray());
                }

                Stopwatch stopwatch = Stopwatch.StartNew();
                tree = c45.Learn(dInputs, outputs); // induce tree from data
                stopwatch.Stop();
                elapsedTime = stopwatch.ElapsedMilliseconds;
            }
        }

        rules = tree.ToRules(attributes, decisionVariableType);
        predicted = tree.ComputeDecision(dInputs);
    }
}