Beispiel #1
0
 /// <summary>
 /// Loads an arff file: the header returned by the reader is stored in <c>Header</c> and
 /// all of its instances are stored in <c>Instances</c>. Both are reset to <c>null</c> first,
 /// so they stay <c>null</c> when reading fails.
 /// </summary>
 /// <param name="arff">Arff file name</param>
 /// <exception cref="Exception">
 /// Thrown for any failure (including a missing file); the original exception is attached as the
 /// inner exception. When <c>Verbose</c> is set, the message also contains the original stack trace.
 /// </exception>
 private void ReadArff(string arff)
 {
     Header    = null;
     Instances = null;
     try
     {
         // Guard first: a missing file is reported through the same wrapping path as any other failure.
         if (!File.Exists(arff))
         {
             throw new FileNotFoundException(string.Format("File \"{0}\" does not exist.", arff), arff);
         }

         using (ArffReader reader = new ArffReader(arff))
         {
             Header    = reader.ReadHeader();
             Instances = reader.ReadAllInstances();
         }
     }
     catch (Exception cause)
     {
         // Verbose mode appends the stack trace of the underlying failure to the message.
         string message = Verbose
             ? string.Format("Arff file could not be read: {0}\n{1}", cause.Message, cause.StackTrace)
             : string.Format("Arff file could not be read: {0}", cause.Message);

         throw new Exception(message, cause);
     }
 }
Beispiel #2
0
        /// <summary>
        /// Reads the header of an ARFF document whose lines are separated by bare carriage returns (\r).
        /// </summary>
        // NOTE(review): the expected outcome (success vs. exception) is not visible in this snippet.
        public void CRLineEndings()
        {
            string arff = "@relation relationName\r@attribute a1 numeric\r@data";

            // The reader is disposable, so release it deterministically even if ReadHeader throws.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadHeader();
            }
        }
Beispiel #3
0
        /// <summary>
        /// Reads the header of an ARFF document that has a relation name and a data section
        /// but declares no attributes.
        /// </summary>
        // NOTE(review): the expected outcome (success vs. exception) is not visible in this snippet.
        public void MissingAttributes()
        {
            string arff = @"@relation relationName
                            @data";

            // The reader is disposable, so release it deterministically even if ReadHeader throws.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadHeader();
            }
        }
Beispiel #4
0
        /// <summary>
        /// Reads the header of an ARFF document that declares an attribute and a data section
        /// but has no relation line.
        /// </summary>
        // NOTE(review): the expected outcome (success vs. exception) is not visible in this snippet.
        public void MissingRelationName()
        {
            string arff = @"@attribute a1 numeric
                            @data";

            // The reader is disposable, so release it deterministically even if ReadHeader throws.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadHeader();
            }
        }
Beispiel #5
0
        /// <summary>
        /// Calls <c>ReadInstance</c> on a freshly created reader without calling <c>ReadHeader</c> first.
        /// </summary>
        // NOTE(review): the expected outcome (success vs. exception) is not visible in this snippet.
        public void ReadInstanceBeforeHeader()
        {
            string arff = @"@relation relationName
                            @attribute a1 numeric
                            @data";

            // The reader is disposable, so release it deterministically even if ReadInstance throws.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadInstance();
            }
        }
Beispiel #6
0
        /// <summary>
        /// Reads the header of an ARFF document whose keywords (relation, attribute, numeric, data)
        /// are written in mixed upper and lower case.
        /// </summary>
        // NOTE(review): the expected outcome (success vs. exception) is not visible in this snippet.
        public void KeywordCaseInsensitivity()
        {
            string arff = @"@RElatION relationName
                            @ATTRiBUTE a1 nuMERic
                            @DaTa";

            // The reader is disposable, so release it deterministically even if ReadHeader throws.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadHeader();
            }
        }
Beispiel #7
0
        /// <summary>
        /// Reads the header of an ARFF document that has a relation name and an attribute
        /// but ends without a data line.
        /// </summary>
        // NOTE(review): the expected outcome (success vs. exception) is not visible in this snippet.
        public void MissingData()
        {
            string arff = @"@relation relationName
                            @attribute a1 numeric
                            ";

            // The reader is disposable, so release it deterministically even if ReadHeader throws.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadHeader();
            }
        }
Beispiel #8
0
        /// <summary>
        /// Verifies that <c>ReadInstance</c> returns <c>null</c> when the data section is empty,
        /// and keeps returning <c>null</c> when called again.
        /// </summary>
        public void NoInstances()
        {
            string arff = @"@relation relationName
                            @attribute a1 numeric
                            @data";

            // The reader is disposable, so release it deterministically even if an assertion fails.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadHeader();

                Assert.IsNull(arffReader.ReadInstance());
                Assert.IsNull(arffReader.ReadInstance());
            }
        }
Beispiel #9
0
        /// <summary>
        /// Verifies the instance weight reported by <c>ReadInstance(out double?)</c> for five data lines:
        /// dense and sparse instances without a trailing <c>{weight}</c> yield <c>null</c>, and instances
        /// with one yield that weight (5 and 0.476).
        /// </summary>
        public void InstanceWeightsReadCorrectly()
        {
            string arff = @"@relation relationName
                            @attribute a1 numeric
                            @attribute a2 integer
                            @attribute a3 real
                            @data
                            -6.54,42,0.0
                            -6.54,42,0.0,{5}
                            -6.54,42,0.0,{0.476}
                            {0 -6.54,1 42}
                            {0 -6.54,1 42},{0.476}";

            // The reader is disposable, so release it deterministically even if an assertion fails.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                arffReader.ReadHeader();

                // Dense instance without weight.
                arffReader.ReadInstance(out double? instanceWeight);
                Assert.IsNull(instanceWeight);

                // Dense instance with integer weight.
                arffReader.ReadInstance(out instanceWeight);
                Assert.AreEqual(5.0, instanceWeight);

                // Dense instance with fractional weight.
                arffReader.ReadInstance(out instanceWeight);
                Assert.AreEqual(0.476, instanceWeight);

                // Sparse instance without weight.
                arffReader.ReadInstance(out instanceWeight);
                Assert.IsNull(instanceWeight);

                // Sparse instance with fractional weight.
                arffReader.ReadInstance(out instanceWeight);
                Assert.AreEqual(0.476, instanceWeight);
            }
        }
Beispiel #10
0
        /// <summary>
        /// Parses <paramref name="arff"/> and compares the parsed relation name, attributes and instances
        /// against the expected values. Each comparison is skipped when its expected value is <c>null</c>;
        /// the header and all instances are read in any case.
        /// </summary>
        /// <param name="arff">ARFF document text handed to <c>CreateArffReader</c>.</param>
        /// <param name="expectedRelationName">Expected relation name, or <c>null</c> to skip the check.</param>
        /// <param name="expectedAttributes">Expected attributes, or <c>null</c> to skip the check.</param>
        /// <param name="expectedInstances">Expected instances (compared with <c>InstanceComparer</c>), or <c>null</c> to skip the check.</param>
        private void AssertReader(string arff, string expectedRelationName = null, ICollection expectedAttributes = null, object[][] expectedInstances = null)
        {
            // The reader is disposable; "using" releases it even when one of the assertions throws.
            using (ArffReader arffReader = CreateArffReader(arff))
            {
                ArffHeader arffHeader = arffReader.ReadHeader();

                if (expectedRelationName != null)
                {
                    Assert.AreEqual(expectedRelationName, arffHeader.RelationName, "Unexpected relation name.");
                }

                if (expectedAttributes != null)
                {
                    CollectionAssert.AreEqual(expectedAttributes, arffHeader.Attributes, "Unexpected attributes.");
                }

                object[][] instances = arffReader.ReadAllInstances();

                if (expectedInstances != null)
                {
                    CollectionAssert.AreEqual(expectedInstances, instances, new InstanceComparer(), "Unexpected instances.");
                }
            }
        }
Beispiel #11
0
        /// <summary>
        /// Trains an <c>Id3Classifier</c> on the training ARFF file for a range of confidence values and
        /// prints, per confidence, the tree size and the accuracy on the training and test data.
        /// The training data is also exported as CSV, and for confidences above 0.5 the tree is written
        /// as rules to a text file in the output folder.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Reads every instance of an ARFF file as a row of ints; missing values (null) become -1.
            List <int[]> ReadRows(string arffFile, out ArffHeader fileHeader)
            {
                List <int[]> rows = new List <int[]>();
                using (ArffReader arffReader = new ArffReader(arffFile))
                {
                    fileHeader = arffReader.ReadHeader();
                    object[] instance;
                    while ((instance = arffReader.ReadInstance()) != null)
                    {
                        rows.Add(instance.Select(o => o == null ? -1 : (int)o).ToArray());
                    }
                }

                return rows;
            }

            // Training. This header is the one used for the CSV export and the rule output below.
            List <int[]> trainingData = ReadRows(_trainingArffFile, out ArffHeader header);

            // Test. Its header is deliberately discarded so it cannot replace the training header
            // that describes the data the trees are built from.
            List <int[]> testData = ReadRows(_testArffFile, out _);

            double[] confidences = new double[]
            {
                0.0,
                0.1,
                0.2,
                0.4,
                0.6,
                0.8,
                0.9,
                0.95,
                0.99,
                0.9999
            };

            PrintAsCsv(header, trainingData, _pathToOutputCsv);

            // Loop-invariant: create the output folder once instead of once per printed tree.
            Directory.CreateDirectory(_outputFolder);

            Parallel.ForEach(confidences, confidence =>
            {
                Id3Classifier classifier = new Id3Classifier(trainingData, _classIndex, confidence);

                Console.WriteLine($"Confidence {confidence}: Num of nodes {classifier.Tree.Count}");

                // Test accuracy on training
                double trainAccuracy = trainingData.Count(row => classifier.GetClass(row) == row[_classIndex]) / (double)trainingData.Count;
                Console.WriteLine($"Confidence {confidence}: Accuracy on train = {trainAccuracy}");

                // Test accuracy on test
                double testAccuracy = testData.Count(row => classifier.GetClass(row) == row[_classIndex]) / (double)testData.Count;
                Console.WriteLine($"Confidence {confidence}: Accuracy on test = {testAccuracy}");

                // Only print small trees.
                if (confidence > 0.5)
                {
                    StringBuilder sb            = new StringBuilder();
                    StringBuilder sbMaxPositive = new StringBuilder();
                    StringBuilder sbMaxNegative = new StringBuilder();
                    int maxPositive             = int.MinValue;
                    int maxNegative             = int.MinValue;

                    PrintTreeAsRules(sb, ref sbMaxPositive, ref sbMaxNegative, ref maxPositive, ref maxNegative, classifier.Tree, header);
                    sb.AppendLine("The most max positive rule is:");
                    sb.AppendLine(sbMaxPositive.ToString());
                    sb.AppendLine();
                    sb.AppendLine("The most max negative rule is:");
                    sb.AppendLine(sbMaxNegative.ToString());
                    File.WriteAllText(Path.Combine(_outputFolder, $"Tree{confidence}.txt"), sb.ToString());
                }
            });

            Console.WriteLine("Press ENTER to exit...");
            Console.ReadLine();
        }
Beispiel #12
0
        /// <summary>
        /// Builds one ARFF file, named after <c>textBox3.Text</c> with ".arff" appended, from the given ARFF files.
        /// From each input file the last instance is taken and reduced to the attributes whose names are
        /// listed (one per line) in "attributes.txt"; the order of those lines defines the output column order.
        /// The relation name and the selected attributes are declared once, taken from the first input file
        /// that matches at least one listed attribute. Input files without instances contribute no output row.
        /// <c>progress</c> is incremented once per input file.
        /// </summary>
        /// <param name="allfiles">Paths of the ARFF files to combine.</param>
        private void WriteArffFile(string[] allfiles)
        {
            using (ArffWriter arffWriter = new ArffWriter(textBox3.Text + ".arff"))
            {
                // Loaded on first use, so an empty input list never touches attributes.txt.
                string[] wantedNames   = null;
                bool     headerWritten = false;

                // Rows are buffered because all attribute declarations must be written before any instance.
                var rows = new System.Collections.Generic.List<object[]>(allfiles.Length);

                foreach (string file in allfiles)
                {
                    using (ArffReader datafile = new ArffReader(file))
                    {
                        ArffHeader header = datafile.ReadHeader();

                        // Only the last instance of each file is used.
                        object[] lastInstance = null;
                        foreach (var current in datafile.ReadAllInstances())
                        {
                            lastInstance = current;
                        }

                        if (header != null && lastInstance != null)
                        {
                            if (wantedNames == null)
                            {
                                wantedNames = File.ReadAllLines("attributes.txt");
                            }

                            // Attributes are declared only by the first file that matches any of them;
                            // later files only contribute values.
                            bool declareAttributes = !headerWritten;
                            var  selectedValues    = new System.Collections.Generic.List<object>();

                            foreach (string wantedName in wantedNames)
                            {
                                // Position of the attribute inside the instance; restarts for every wanted name.
                                int attributeIndex = 0;

                                foreach (var attribute in header.Attributes)
                                {
                                    if (attribute.Name == wantedName)
                                    {
                                        if (declareAttributes)
                                        {
                                            if (!headerWritten)
                                            {
                                                arffWriter.WriteRelationName(header.RelationName);
                                                headerWritten = true;
                                            }

                                            arffWriter.WriteAttribute(new ArffAttribute(attribute.Name, attribute.Type));
                                        }

                                        selectedValues.Add(lastInstance[attributeIndex]);
                                    }

                                    attributeIndex++;
                                }
                            }

                            rows.Add(selectedValues.ToArray());
                        }
                    }

                    progress++;
                }

                foreach (object[] row in rows)
                {
                    // Pass the row itself; wrapping it in another array would write a single nested value.
                    arffWriter.WriteInstance(row);
                }
            }
        }
Beispiel #13
0
        /// <summary>
        /// Runs the sampling experiment 100 times in parallel. For each run and each sample count, one
        /// <c>Id3Classifier</c> is trained per sample produced by <c>Sampler</c>; the resulting classifiers are
        /// evaluated on the training and the test data, and one CSV line per sample count is printed.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Reads every instance of an ARFF file as a row of ints; missing values (null) become -1.
            List <int[]> ReadRows(string arffFile)
            {
                List <int[]> rows = new List <int[]>();
                using (ArffReader arffReader = new ArffReader(arffFile))
                {
                    // The header content is not needed here, but it is still read before the instances
                    // exactly as the original code did.
                    arffReader.ReadHeader();
                    object[] instance;
                    while ((instance = arffReader.ReadInstance()) != null)
                    {
                        rows.Add(instance.Select(o => o == null ? -1 : (int)o).ToArray());
                    }
                }

                return rows;
            }

            List <int[]> trainingData = ReadRows(TrainingArffFile);
            List <int[]> testData     = ReadRows(TestArffFile);

            // Single source of truth for the evaluated sample counts (used for both result maps).
            int[] sampleCounts = { 1, 3, 5, 10, 20, 25, 50, 75, 100 };

            Console.WriteLine("Number of Samples, training accuracy, test accuracy");
            // Do this excercise multiple times as sampling is random.
            Parallel.For(0, 100, (k) =>
            {
                // Result maps for this run, keyed by sample count and initialised to 0.
                ConcurrentDictionary <int, double> sampleTrainingAccuraciesMap = new ConcurrentDictionary <int, double>(
                    sampleCounts.Select(count => new KeyValuePair <int, double>(count, 0)));
                ConcurrentDictionary <int, double> sampleTestAccuraciesMap = new ConcurrentDictionary <int, double>(
                    sampleCounts.Select(count => new KeyValuePair <int, double>(count, 0)));

                // Calculate different sample accuracies in parallel.
                Parallel.ForEach(sampleCounts, numOfSamples =>
                {
                    List <Id3Classifier> classifiers = new List <Id3Classifier>();
                    Sampler sampler = new Sampler(trainingData, numOfSamples);

                    for (int i = 0; i < numOfSamples; i++)
                    {
                        Id3Classifier classifier = new Id3Classifier(sampler.Samples[i], ClassIndex, Confidence);
                        classifiers.Add(classifier);
                    }

                    // Evaluate training and test to look out for overfitting.
                    sampleTrainingAccuraciesMap[numOfSamples] = Evaluate(trainingData, classifiers);
                    sampleTestAccuraciesMap[numOfSamples]     = Evaluate(testData, classifiers);
                });

                // Print the whole run under the lock so lines of different runs do not interleave.
                lock (_lockConsole)
                {
                    foreach (int numOfSamples in sampleCounts.OrderBy(n => n))
                    {
                        Console.WriteLine($"{numOfSamples},{sampleTrainingAccuraciesMap[numOfSamples]},{sampleTestAccuraciesMap[numOfSamples]}");
                    }
                }
            });

            Console.WriteLine("Press ENTER to exit...");
            Console.ReadLine();
        }
Beispiel #14
0
        /// <summary>
        /// Reads the ARFF file at <c>filePath</c>, records per-attribute information, converts every instance
        /// into numeric rows, trains a C4.5 tree when that algorithm is selected, and finally derives the
        /// rules and the predictions for the input rows from <c>tree</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the decision variable type is Discrete but an attribute type starts with "numeric" or
        /// "real", or its textual form contains no comma.
        /// </exception>
        private void Learn()
        {
            using (ArffReader reader = new ArffReader(filePath))
            {
                header        = reader.ReadHeader();
                attributes    = new Dictionary <string, AttributeListInfo>(); // attribute name -> attribute info
                attributeList = header.Attributes.ToList();

                foreach (var arffAttribute in attributeList)
                {
                    // The textual form of the attribute type is split on commas to obtain its parts.
                    string[] typeParts = arffAttribute.Type.ToString().Split(',');

                    bool notEnumerated = typeParts[0] == "numeric" || typeParts[0] == "real" || typeParts.Length == 1;
                    if (decisionVariableType == AttributeType.Discrete && notEnumerated)
                    {
                        throw new InvalidOperationException("Variable Type Should be Continous, not Discrete");
                    }

                    // Strip the surrounding braces and whitespace from every part.
                    List <string> cleanedParts = typeParts
                        .Select(part => part.Replace("{", "").Replace("}", "").Trim())
                        .ToList();

                    // Describe the attribute, then register it in the dictionary.
                    AttributeListInfo info = new AttributeListInfo();
                    info.NumberOfInputs = cleanedParts.Count;
                    info.Inputs         = cleanedParts;

                    attributes.Add(arffAttribute.Name, info);

                    if (_attributes.Count >= attributeList.Count - 1)
                    {
                        // Once enough decision attributes are collected, only the attribute name is remembered.
                        lastAttrName = arffAttribute.Name;
                    }
                    else
                    {
                        _attributes.Add(new Attribute(arffAttribute.Name, info.NumberOfInputs));
                        queryList.Add(new Query(arffAttribute.Name, cleanedParts));
                    }
                }

                List <double[]> featureRows = new List <double[]>();
                List <double[]> fullRows    = new List <double[]>();
                List <int>      labels      = new List <int>();

                object[] instance;
                while ((instance = reader.ReadInstance()) != null)
                {
                    int      width      = instance.Length;
                    double[] featureRow = new double[width - 1];
                    double[] fullRow    = new double[width];

                    // Values that cannot be parsed as a double are stored as 0 by TryParse.
                    for (int column = 0; column < width; column++)
                    {
                        Double.TryParse(instance[column] + "", out fullRow[column]);
                    }

                    // The feature row is the full row without its last column.
                    Array.Copy(fullRow, featureRow, width - 1);

                    featureRows.Add(featureRow);
                    fullRows.Add(fullRow);
                    labels.Add(Int32.Parse(instance[width - 1] + ""));
                }

                dInputs    = featureRows.ToArray(); // inputs (all columns except the last)
                allDInputs = fullRows.ToArray();    // all columns as doubles
                outputs    = labels.ToArray();

                // Decide how to construct the learner based on the options.
                bool withoutAttributeInfo = decisionVariableType == AttributeType.Continuous
                                            || decisionVariableType == AttributeType.Both;
                bool withAttributeInfo    = decisionVariableType == AttributeType.Discrete;

                if (learningAlgorithm == LearningAlgorithm.C45Learning && (withoutAttributeInfo || withAttributeInfo))
                {
                    if (withoutAttributeInfo)
                    {
                        c45 = new C45Algorithm();
                    }
                    else
                    {
                        c45 = new C45Algorithm(_attributes.ToArray());
                    }

                    var timer = new Stopwatch();
                    timer.Start();
                    tree = c45.Learn(dInputs, outputs); // induce tree from data
                    timer.Stop();
                    elapsedTime = timer.ElapsedMilliseconds;
                }

                rules = tree.ToRules(attributes, decisionVariableType);

                predicted = tree.ComputeDecision(dInputs);
            }
        }