public HashSet <HtmlNode> RunOnTestSeenSet()
        {
            HashSet <HtmlNode> classifierSelectedNodes = new HashSet <HtmlNode>();

            InitTestSeen();
            foreach (string featureString in FeaturesUsed)
            {
                HashSet <HtmlNode> resNodes = DomPool.TESTSeenRunXpathQuery(useNormalPerformanceQUERY(featureString));
                foreach (HtmlNode nd in resNodes)
                {
                    if (!testSeenAllNodes.Contains(nd))
                    {
                        continue;
                    }
                    testSeenNodeFeatures[nd].Add(featureString);
                }
            }

            FastVector fvWekaAttributes = GetDataSetAtts();
            Instances  testSet          = new Instances("TestSeenSet", fvWekaAttributes, 10);

            testSet.setClassIndex(fvWekaAttributes.size() - 1);

            foreach (HtmlNode currNode in testSeenAllNodes)
            {
                Instance item = new SparseInstance(fvWekaAttributes.size());

                for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
                {
                    weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
                    if (testSeenNodeFeatures[currNode].Contains(currFeature.name()))
                    {
                        item.setValue(currFeature, 1);
                    }
                    else
                    {
                        item.setValue(currFeature, 0);
                    }
                }

                //set the class
                weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
                //string rightVal = DomPool.TargetNodes.Contains(currNode) ? "yes" : "no";
                item.setDataset(testSet);

                double classifierdv  = classifierTree.classifyInstance(item);
                string classifierVal = classFeature.value((int)classifierdv);

                if (classifierVal.Equals("yes"))
                {
                    classifierSelectedNodes.Add(currNode);
                }

                testSet.add(item);
            }

            return(classifierSelectedNodes);
        }
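
A minimal usage sketch for the method above; the enclosing class name XpathClassifier is hypothetical, and DomPool is assumed to be populated before training:

        // Usage sketch: train on the seen set, then classify the test-seen set.
        public static void RunExample()
        {
            var clf = new XpathClassifier();                 // hypothetical class name
            clf.LearnModel();                                // builds classifierTree and FeaturesUsed (LearnModel variants appear later in this listing)
            HashSet <HtmlNode> selected = clf.RunOnTestSeenSet();
            Console.WriteLine("classifier selected " + selected.Count + " nodes");
        }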
Example #2
        public void get_value_instance()
        {
            var dValues = new[] { 1f, 0f, 3f, 4f, 1f, 0f, 1f, 0f };

            var sOrdinalValues = new[] { 1f, 0f, 3f, 4f };
            var dBinaryOffset  = sOrdinalValues.Length;

            var sValues  = new[] { 1f, 3f, 4f };
            var sIndices = new[] { 0, 2, 3 };

            var sBinaryValues  = new[] { 1f, 0f, 1f, 0f };
            var sBinaryIndices = new[] { 0, 2 };

            var dInstance = new DenseInstance(
                dValues,
                dBinaryOffset);

            var sInstance = new SparseInstance(
                sValues,
                sIndices,
                sBinaryIndices,
                sOrdinalValues.Length,
                sBinaryValues.Length);

            var sOutputValues = sInstance.GetValues();
            var dOutputValues = dInstance.GetValues();

            Assert.Equal(3f, sInstance.GetValue(2));
            Assert.Equal(3f, dInstance.GetValue(2));

            Assert.Equal(0f, sInstance.GetValue(7));
            Assert.Equal(0f, dInstance.GetValue(7));

            Assert.True(Math.Abs(dOutputValues.L2Norm() - sOutputValues.L2Norm()) < Epsilon);
        }
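
The dense and sparse constructions above encode the same eight-element vector, so the spot checks at indices 2 and 7 generalize. A sketch of the full element-wise comparison, using the same constructors and accessors as the test above:

        public void get_value_instance_elementwise()
        {
            var dValues   = new[] { 1f, 0f, 3f, 4f, 1f, 0f, 1f, 0f };
            var dInstance = new DenseInstance(dValues, 4);   // binary block starts at offset 4
            var sInstance = new SparseInstance(
                new[] { 1f, 3f, 4f },                        // non-zero ordinal values
                new[] { 0, 2, 3 },                           // their indices
                new[] { 0, 2 },                              // indices of the set binary flags
                4,                                           // ordinal section length
                4);                                          // binary section length

            // The two representations should agree at every position.
            for (var k = 0; k < dValues.Length; k++)
            {
                Assert.Equal(dInstance.GetValue(k), sInstance.GetValue(k));
            }
        }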
Example #3
        private PmlInstance CreateInstanceForRow(T row)
        {
            var extendable = row is ExtendableObjBase ?
                             (row as ExtendableObjBase).GetExtendedPropertiesValues() : new object[0];
            var indexes = Enumerable
                          .Range(0, properties.Length + extendable.Count + tohasclass.Length)
                          .ToList();

            if (resuable_rowvals == null)
            {
                resuable_rowvals = new double[properties.Length + extendable.Count + tohasclass.Length + tobinarize.Length];
            }

            for (int i = 0; i < properties.Length; i++)
            {
                resuable_rowvals[i] = GetValue(attributes[i], Helpers.GetValue(row, properties[i]));
            }

            for (int i = 0; i < extendable.Count; i++)
            {
                resuable_rowvals[properties.Length + i] =
                    GetValue(attributes[properties.Length + i], extendable.ElementAt(i));
            }

            for (int i = 0; i < tohasclass.Length; i++)
            {
                var val = Helpers.GetValue(row, tohasclass[i]);
                resuable_rowvals[properties.Length + extendable.Count + i] = cached_has_class[tohasclass[i]].ContainsKey(val) ? 1.0 : 0.0;
            }

            for (int i = 0; i < tobinarize.Length; i++)
            {
                resuable_rowvals[properties.Length + extendable.Count + tohasclass.Length + i] = 1.0;
                var name = tobinarize[i] + "_" +
                           InternalHelpers.ToStringSafe(Helpers.GetValue(row, tobinarize[i]));
                try { indexes.Add(attributes_indexes_cache[name]); }
                catch (KeyNotFoundException) {
                    Console.WriteLine("Could not find '" + name + "' in the attributes_indexes_cache.");
                    throw;
                }
            }
            var impl = new SparseInstance(1.0, resuable_rowvals, indexes.ToArray(), attributes.Count);

            return(new PmlInstance(impl));
        }
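
The value buffer built above is positional: property values first, then extended properties, then the has-class flags, and finally a constant 1.0 per binarized column whose real dataset index is resolved by name. A layout sketch under hypothetical sizes (2 properties, 1 extended value, 1 has-class column, 1 binarized column):

        private static double[] LayoutSketch()
        {
            var vals = new double[2 + 1 + 1 + 1];
            // vals[0..1] : property values
            // vals[2]    : extended property value
            // vals[3]    : has-class flag (1.0 or 0.0)
            // vals[4]    : constant 1.0 marker for the one-hot slot; its dataset
            //              index is the attributes_indexes_cache entry for the
            //              key "<attribute>_<stringified value>"
            return vals;
        }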
Example #4
        public void dist_sparse_instance()
        {
            var ordinalValues1 = new[] { 1f, 0f, 3f, 4f };
            var values1        = new[] { 1f, 3f, 4f };
            var indices1       = new[] { 0, 2, 3 };

            var binaryValues1  = new[] { 1f, 0f, 1f, 0f };
            var binaryIndices1 = new[] { 0, 2 };

            var ordinalValues2 = new[] { 5f, 6f, 0f, 8f };
            var values2        = new[] { 5f, 6f, 8f };
            var indices2       = new[] { 0, 1, 3 };

            var binaryValues2  = new[] { 0f, 1f, 1f, 0f };
            var binaryIndices2 = new[] { 1, 2 };

            var t = new[, ]
            {
                { 1f, 0f, 3f, 4f, 1f, 0f, 1f, 0f },
                { 5f, 6f, 0f, 8f, 0f, 1f, 1f, 0f }
            };

            var instance1 = new SparseInstance(
                values1,
                indices1,
                binaryIndices1,
                ordinalValues1.Length,
                binaryValues1.Length);

            var instance2 = new SparseInstance(
                values2,
                indices2,
                binaryIndices2,
                ordinalValues2.Length,
                binaryValues2.Length);

            var distInstance1 = instance1.L2Dist(instance2);
            var distInstance2 = instance1.L2Dist(t, 1, 8);
            var distArray     = t.L2Dist(0, 1, 8);

            Assert.True(Math.Abs(distInstance1 - distArray) < Epsilon);
            Assert.True(Math.Abs(distInstance2 - distArray) < Epsilon);
        }
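
For reference, the value both assertions compare against can be computed directly from the two dense rows; whether L2Dist returns the squared or the rooted distance is the library's convention, so this sketch shows both forms:

        private static void DistSketch()
        {
            var t = new[, ]
            {
                { 1f, 0f, 3f, 4f, 1f, 0f, 1f, 0f },
                { 5f, 6f, 0f, 8f, 0f, 1f, 1f, 0f }
            };
            var squared = 0f;
            for (var k = 0; k < t.GetLength(1); k++)
            {
                var d = t[0, k] - t[1, k];
                squared += d * d;                            // squared L2 distance
            }
            var rooted = Math.Sqrt(squared);                 // Euclidean distance
            Console.WriteLine(squared + " / " + rooted);
        }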
        /// <summary>
        /// Creates a WEKA <see cref="Instance"/> and adds it to the <see cref="Instances"/>.
        /// </summary>
        /// <param name="instances">the data set</param>
        /// <param name="className">name of the class; if null, the first class from the data set is assigned</param>
        /// <param name="features">the features</param>
        /// <param name="forceAdd">if true, an unclassified <see cref="Instance"/> is added to the data set; otherwise no <see cref="Instance"/> is created</param>
        /// <returns>the WEKA <see cref="Instance"/>, attached to the supplied data set, or null</returns>
        public static Instance CreateWekaInstance(this Instances instances, string className, IEnumerable <Tuple <string, double> > features, bool forceAdd = false)
        {
            var instance = new SparseInstance(instances.numAttributes());

            foreach (var feature in features)
            {
                var attribute = instances.attribute(feature.Item1);
                if (null != attribute)
                {
                    instance.setValue(attribute, feature.Item2);
                }
            }
            var classIndex = null == className ? -1 : instances.classAttribute().indexOfValue(className);

            if (!forceAdd && -1 == classIndex)
            {
                return(null);
            }
            instance.setValue(instances.classAttribute(), classIndex);
            instances.add(instance);
            return(instances.lastInstance());
        }
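
A usage sketch for the extension method above; the attribute names and the class value "spam" are hypothetical and must already exist in the data set's header:

        public static void CreateWekaInstanceUsage(Instances instances)
        {
            var features = new List <Tuple <string, double> >
            {
                Tuple.Create("wordCount", 42.0),             // hypothetical attribute
                Tuple.Create("hasLink", 1.0)                 // hypothetical attribute
            };
            // Returns null when "spam" is not a known class value and forceAdd is false.
            Instance labelled = instances.CreateWekaInstance("spam", features);
            // With forceAdd, an unclassified instance is still added and returned.
            Instance unlabelled = instances.CreateWekaInstance(null, features, forceAdd: true);
        }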
Example #6
        // Used in Realtime: This is a helper method to create instances from the internal model files
        public static void CreateInstances(Instances instances)
        {
            int[]    realKeyIndices = new int[TrainingTesting_SharedVariables._trainTopIGFeatures.Length];
            int[]    indices        = new int[TrainingTesting_SharedVariables._trainTopIGFeatures.Length];
            double[] values         = new double[TrainingTesting_SharedVariables._trainTopIGFeatures.Length];
            int      i = 0;
            int      key;

            foreach (int k in TrainingTesting_SharedVariables._trainTopIGFeatures)
            {
                //samples is 1-based while key is 0-based, so we convert the 0-based index to a 1-based index by adding 1.
                key = k + 1;
                if (Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].ContainsKey(key))
                {
                    realKeyIndices[i] = key;

                    indices[i] = i;
                    values[i]  = Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector][key];
                }
                else
                {
                    realKeyIndices[i] = key;

                    indices[i] = i;
                    if (i == TrainingTesting_SharedVariables._trainTopIGFeatures.Length - 1)
                    {
                        values[i] = 2; // assign a class; '2' is temporary and should be based on something else, or ignored completely. It should not matter which class it is, since this is testing.
                    }
                    //else
                    //    values[i] = 1;  // is it proper to put 0 as a missing value in a sparse array for a value which is not in the original problem?
                }
                i += 1;
            }

            //int[] indices = Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].Keys.ToArray();
            //double[] values = Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].Values.ToArray();

            //init statistical matrices. (done once)
            if (Preferences.Instance.MatrixStatistics == null)
            {
                Preferences.Instance.MatrixStatistics = new StatisticsFeatures(TrainingTesting_SharedVariables._trainTopIGFeatures.Length, Preferences.Instance.events.EventListLastTr, realKeyIndices, false);
            }
            if (Preferences.Instance.MatrixStatisticsNormalized == null)
            {
                Preferences.Instance.MatrixStatisticsNormalized = new StatisticsFeatures(TrainingTesting_SharedVariables._trainTopIGFeatures.Length, Preferences.Instance.events.EventListLastTr, realKeyIndices, false);
            }
            if (Preferences.Instance.TestingBaselineStatistics == null)
            {
                Preferences.Instance.TestingBaselineStatistics = new StatisticsFeatures(TrainingTesting_SharedVariables._trainTopIGFeatures.Length, Preferences.Instance.events.eventList[0].var2, realKeyIndices, true);
            }

            //update matrix
            Preferences.Instance.MatrixStatistics.updateMatrix(Preferences.Instance.currentClassifiedVector, values);
            Preferences.Instance.TestingBaselineStatistics.updateMatrix(Preferences.Instance.currentClassifiedVector, values);

            //normalize real time raw values.
            if (!TrainingTesting_SharedVariables._svmscaleTesting.NormalizeSingleVector(realKeyIndices, ref values))
            {
                GuiPreferences.Instance.setLog("normalization failed");
                return;
            }

            Preferences.Instance.MatrixStatisticsNormalized.updateMatrix(Preferences.Instance.currentClassifiedVector, values);

            //the value is destroyed in the normalization stage, so we reassign the fake class (not used during testing).
            values[TrainingTesting_SharedVariables._trainTopIGFeatures.Length - 1] = 2;

            var instance = new SparseInstance(1.0f, values, indices, indices.Length);

            /*instances.setClassIndex(instances.numAttributes() - 1);
             *
             * var instance = new SparseInstance(instances.numAttributes());
             * instance.setDataset(instances);
             *
             * foreach (int key in Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].Keys)
             * {
             *  //var attribute = instances.attribute(pair.Key);
             *  //double value = pair.Value;
             *  //if (attribute.isNumeric())
             *  instance.setValue(instances.attribute(key), Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector][key]);
             *  //else
             *      //instance.setValue(instances.attribute(pair.Key), value.ToString());
             * }*/
            instances.add(instance);
            instances.setClassIndex(TrainingTesting_SharedVariables._trainTopIGFeatures.Length - 1);
        }
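
A sketch of consuming the freshly appended vector, assuming a classifier already trained on the same header (the model argument is hypothetical):

        public static double ClassifyLatest(Instances instances, weka.classifiers.Classifier model)
        {
            CreateInstances(instances);                      // appends one SparseInstance and sets the class index
            return model.classifyInstance(instances.lastInstance());
        }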
        public void LearnModel()
        {
            Init();
            foreach (Feature currFeature in DomPool.SelectorFeatures)
            {
                String             featureString = currFeature.ToString();
                HashSet <HtmlNode> resNodes      = DomPool.RunXpathQuery(featureString);
                foreach (HtmlNode nd in resNodes)
                {
                    if (!allNodes.Contains(nd))
                    {
                        continue;
                    }
                    nodeFeatures[nd].Add(featureString);
                }
            }

            FastVector fvWekaAttributes = GetDataSetAtts();
            Instances  trainingSet      = new Instances("TS", fvWekaAttributes, 100);

            trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

            foreach (HtmlNode currNode in allNodes)
            {
                Instance item = new SparseInstance(fvWekaAttributes.size());

                for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
                {
                    weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
                    if (nodeFeatures[currNode].Contains(currFeature.name()))
                    {
                        item.setValue(currFeature, 1);
                    }
                    else
                    {
                        item.setValue(currFeature, 0);
                    }
                }

                //set the class
                weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
                item.setValue(classFeature, (DomPool.TargetNodes.Contains(currNode)?"yes":"no"));
                item.setDataset(trainingSet);
                if (DomPool.TargetNodes.Contains(currNode))
                {
                    for (int t = 0; t < (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()); t++)
                    {
                        trainingSet.add(new SparseInstance(item));
                    }
                }
                else
                {
                    trainingSet.add(item);
                }
            }

            String[] options = new String[2];
            options[0] = "-C";                 // pruning confidence factor; lower values prune more
            options[1] = "0.1";
            J48 tree = new J48();              // new instance of tree

            tree.setOptions(options);          // set the options
            tree.buildClassifier(trainingSet); // build classifier
            //save the resulting classifier
            classifierTree = tree;

            Reader    treeDot   = new StringReader(tree.graph());
            TreeBuild treeBuild = new TreeBuild();
            Node      treeRoot  = treeBuild.create(treeDot);

            FeaturesUsed = getTreeFeatures(treeRoot);
        }
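
An optional evaluation sketch using WEKA's standard Evaluation API; this is not part of the method above, just one way to sanity-check the same J48 configuration on a training set built as shown:

        public static void EvaluateSketch(Instances trainingSet)
        {
            var eval = new weka.classifiers.Evaluation(trainingSet);
            var tree = new J48();
            tree.setOptions(new[] { "-C", "0.1" });          // same pruning confidence as above
            eval.crossValidateModel(tree, trainingSet, 10, new java.util.Random(1));
            Console.WriteLine(eval.toSummaryString());
        }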
Example #8
        public void LearnModel()
        {
            Init();
            foreach (Feature currFeature in DomPool.SelectorFeatures)
            {
                String             featureString = currFeature.ToString();
                HashSet <HtmlNode> resNodes      = DomPool.RunXpathQuery(featureString);
                foreach (HtmlNode nd in resNodes)
                {
                    if (!allNodes.Contains(nd))
                    {
                        continue;
                    }
                    nodeFeatures[nd].Add(featureString);
                }
            }
            FastVector fvWekaAttributes = GetDataSetAtts();
            Instances  trainingSet      = new Instances("TS", fvWekaAttributes, 10);

            trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

            foreach (HtmlNode currNode in allNodes)
            {
                Instance item = new SparseInstance(fvWekaAttributes.size());

                for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
                {
                    weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
                    if (nodeFeatures[currNode].Contains(currFeature.name()))
                    {
                        item.setValue(currFeature, 1);
                    }
                    else
                    {
                        item.setValue(currFeature, 0);
                    }
                }

                //set the class
                weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
                item.setValue(classFeature, (DomPool.TargetNodes.Contains(currNode)?"yes":"no"));
                item.setDataset(trainingSet);
                if (DomPool.TargetNodes.Contains(currNode))
                {
                    for (int t = 0; t < (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()); t++)
                    {
                        trainingSet.add(new SparseInstance(item));
                    }
                }
                else
                {
                    trainingSet.add(item);
                }
            }

            //String[] options = new String[2];
            //options = new string[] { "-C", "0.05" };            // J48 pruning confidence (leftover)
            NaiveBayes cls = new NaiveBayes();         // new instance of the classifier

            //cls.setOptions(weka.core.Utils.splitOptions("-C 1.0 -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0\""));
            //cls.setOptions(options);     // set the options
            cls.buildClassifier(trainingSet);  // build classifier
            //save the resulting classifier
            classifier = cls;

            //  Reader treeDot = new StringReader(tree.graph());
            //  TreeBuild treeBuild = new TreeBuild();
            //  Node treeRoot = treeBuild.create(treeDot);
            FeaturesUsed = new HashSet <string>();

            foreach (Feature f in DomPool.SelectorFeatures)
            {
                FeaturesUsed.Add(f.ToString());
            }
        }
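
A follow-up sketch: unlike the J48 variant, NaiveBayes is usually consumed through class probabilities; distributionForInstance is the standard WEKA call (the item argument stands for any instance built against the same header as above):

        public static double ProbabilityOfYes(NaiveBayes cls, Instances header, Instance item)
        {
            double[] probs = cls.distributionForInstance(item);   // one probability per class value
            return probs[header.classAttribute().indexOfValue("yes")];
        }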