public HashSet<HtmlNode> RunOnTestSeenSet()
{
    HashSet<HtmlNode> classifierSelectedNodes = new HashSet<HtmlNode>();
    InitTestSeen();

    foreach (string featureString in FeaturesUsed)
    {
        HashSet<HtmlNode> resNodes = DomPool.TESTSeenRunXpathQuery(useNormalPerformanceQUERY(featureString));
        foreach (HtmlNode nd in resNodes)
        {
            if (!testSeenAllNodes.Contains(nd))
            {
                continue;
            }
            testSeenNodeFeatures[nd].Add(featureString);
        }
    }

    FastVector fvWekaAttributes = GetDataSetAtts();
    Instances testSet = new Instances("TestSeenSet", fvWekaAttributes, 10);
    testSet.setClassIndex(fvWekaAttributes.size() - 1);

    foreach (HtmlNode currNode in testSeenAllNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());
        for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            if (testSeenNodeFeatures[currNode].Contains(currFeature.name()))
            {
                item.setValue(currFeature, 1);
            }
            else
            {
                item.setValue(currFeature, 0);
            }
        }

        // the last attribute is the class
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
        //string rightVal = DomPool.TargetNodes.Contains(currNode) ? "yes" : "no";
        item.setDataset(testSet);

        double classifierdv = classifierTree.classifyInstance(item);
        string classifierVal = classFeature.value((int)classifierdv);
        if (classifierVal.Equals("yes"))
        {
            classifierSelectedNodes.Add(currNode);
        }
        testSet.add(item);
    }

    return classifierSelectedNodes;
}
public void get_value_instance()
{
    var dValues = new[] { 1f, 0f, 3f, 4f, 1f, 0f, 1f, 0f };
    var sOrdinalValues = new[] { 1f, 0f, 3f, 4f };
    var dBinaryOffset = sOrdinalValues.Length;
    var sValues = new[] { 1f, 3f, 4f };
    var sIndices = new[] { 0, 2, 3 };
    var sBinaryValues = new[] { 1f, 0f, 1f, 0f };
    var sBinaryIndices = new[] { 0, 2 };

    var dInstance = new DenseInstance(dValues, dBinaryOffset);
    var sInstance = new SparseInstance(
        sValues,
        sIndices,
        sBinaryIndices,
        sOrdinalValues.Length,
        sBinaryValues.Length);

    var sOutputValues = sInstance.GetValues();
    var dOutputValues = dInstance.GetValues();

    Assert.Equal(3f, sInstance.GetValue(2));
    Assert.Equal(3f, dInstance.GetValue(2));
    Assert.Equal(0f, sInstance.GetValue(7));
    Assert.Equal(0f, dInstance.GetValue(7));
    Assert.True(Math.Abs(dOutputValues.L2Norm() - sOutputValues.L2Norm()) < Epsilon);
}
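The constructor arguments in this test are terse, so here is a sketch of the layout the assertions imply: an instance is an ordinal block followed by a fixed-width binary block, and the dense and sparse forms describe the same vector. The parameter meanings below are inferred from the test, not from documented API, so treat them as assumptions.

// Both of these encode the vector [1, 0, 3, 4, 1, 0, 1, 0].
// DenseInstance stores all eight values plus the offset where the binary
// block begins; SparseInstance stores only the non-zero ordinal values
// with their indices, the indices of the set bits in the binary block,
// and the widths of the two blocks.
var dense = new DenseInstance(
    new[] { 1f, 0f, 3f, 4f, 1f, 0f, 1f, 0f }, // all values
    4);                                       // binary block starts at index 4
var sparse = new SparseInstance(
    new[] { 1f, 3f, 4f }, // non-zero ordinal values
    new[] { 0, 2, 3 },    // their positions in the ordinal block
    new[] { 0, 2 },       // positions of the 1-bits in the binary block
    4,                    // ordinal block width
    4);                   // binary block width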
private PmlInstance CreateInstanceForRow(T row)
{
    var extendable = row is ExtendableObjBase
        ? (row as ExtendableObjBase).GetExtendedPropertiesValues()
        : new object[0];
    var indexes = Enumerable.Range(0, properties.Length + extendable.Count + tohasclass.Length).ToList();

    if (resuable_rowvals == null)
    {
        resuable_rowvals = new double[properties.Length + extendable.Count + tohasclass.Length + tobinarize.Length];
    }

    for (int i = 0; i < properties.Length; i++)
    {
        resuable_rowvals[i] = GetValue(attributes[i], Helpers.GetValue(row, properties[i]));
    }
    for (int i = 0; i < extendable.Count; i++)
    {
        resuable_rowvals[properties.Length + i] = GetValue(attributes[properties.Length + i], extendable.ElementAt(i));
    }
    for (int i = 0; i < tohasclass.Length; i++)
    {
        var val = Helpers.GetValue(row, tohasclass[i]);
        resuable_rowvals[properties.Length + extendable.Count + i] =
            cached_has_class[tohasclass[i]].ContainsKey(val) ? 1.0 : 0.0;
    }
    for (int i = 0; i < tobinarize.Length; i++)
    {
        resuable_rowvals[properties.Length + extendable.Count + tohasclass.Length + i] = 1.0;
        var name = tobinarize[i] + "_" + InternalHelpers.ToStringSafe(Helpers.GetValue(row, tobinarize[i]));
        try
        {
            indexes.Add(attributes_indexes_cache[name]);
        }
        catch (KeyNotFoundException)
        {
            Console.WriteLine("Could not find '" + name + "' in the attributes_indexes_cache.");
            throw;
        }
    }

    var impl = new SparseInstance(1.0, resuable_rowvals, indexes.ToArray(), attributes.Count);
    return new PmlInstance(impl);
}
public void dist_sparse_instance()
{
    var ordinalValues1 = new[] { 1f, 0f, 3f, 4f };
    var values1 = new[] { 1f, 3f, 4f };
    var indices1 = new[] { 0, 2, 3 };
    var binaryValues1 = new[] { 1f, 0f, 1f, 0f };
    var binaryIndices1 = new[] { 0, 2 };

    var ordinalValues2 = new[] { 5f, 6f, 0f, 8f };
    var values2 = new[] { 5f, 6f, 8f };
    var indices2 = new[] { 0, 1, 3 };
    var binaryValues2 = new[] { 0f, 1f, 1f, 0f };
    var binaryIndices2 = new[] { 1, 2 };

    var t = new[,]
    {
        { 1f, 0f, 3f, 4f, 1f, 0f, 1f, 0f },
        { 5f, 6f, 0f, 8f, 0f, 1f, 1f, 0f }
    };

    var instance1 = new SparseInstance(values1, indices1, binaryIndices1, ordinalValues1.Length, binaryValues1.Length);
    var instance2 = new SparseInstance(values2, indices2, binaryIndices2, ordinalValues2.Length, binaryValues2.Length);

    var distInstance1 = instance1.L2Dist(instance2);
    var distInstance2 = instance1.L2Dist(t, 1, 8);
    var distArray = t.L2Dist(0, 1, 8);

    Assert.True(Math.Abs(distInstance1 - distArray) < Epsilon);
    Assert.True(Math.Abs(distInstance2 - distArray) < Epsilon);
}
/// <summary>
/// Creates a WEKA <see cref="Instance"/> and adds it to the given <see cref="Instances"/>.
/// </summary>
/// <param name="instances">the data set</param>
/// <param name="className">name of the class; if null, the first class from the data set is assigned</param>
/// <param name="features">the features</param>
/// <param name="forceAdd">if true, an unclassified <see cref="Instance"/> is added to the data set; otherwise no <see cref="Instance"/> is created</param>
/// <returns>the WEKA <see cref="Instance"/>, attached to the given data set, or null</returns>
public static Instance CreateWekaInstance(this Instances instances, string className, IEnumerable<Tuple<string, double>> features, bool forceAdd = false)
{
    var instance = new SparseInstance(instances.numAttributes());
    foreach (var feature in features)
    {
        var attribute = instances.attribute(feature.Item1);
        if (null != attribute)
        {
            instance.setValue(attribute, feature.Item2);
        }
    }

    var classIndex = null == className ? -1 : instances.classAttribute().indexOfValue(className);
    if (!forceAdd && -1 == classIndex)
    {
        return null;
    }

    instance.setValue(instances.classAttribute(), classIndex);
    instances.add(instance);
    return instances.lastInstance();
}
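A usage sketch for the extension method above. The dataset, attribute names, and class label here are illustrative assumptions, not taken from the original source.

// Hypothetical dataset and attribute names, for illustration only.
// Assumes "dataset" already contains numeric attributes named
// "tokenCount" and "hasLink" plus a nominal class attribute.
var features = new List<Tuple<string, double>>
{
    Tuple.Create("tokenCount", 42.0),
    Tuple.Create("hasLink", 1.0)
};
Instance added = dataset.CreateWekaInstance("spam", features);
if (added == null)
{
    // "spam" is not a known class label and forceAdd was false.
}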
// Used in realtime: helper method to create instances from the internal model files.
public static void CreateInstances(Instances instances)
{
    int[] realKeyIndices = new int[TrainingTesting_SharedVariables._trainTopIGFeatures.Length];
    int[] indices = new int[TrainingTesting_SharedVariables._trainTopIGFeatures.Length];
    double[] values = new double[TrainingTesting_SharedVariables._trainTopIGFeatures.Length];

    int i = 0;
    int key;
    foreach (int k in TrainingTesting_SharedVariables._trainTopIGFeatures)
    {
        // samples is 1-based while key is 0-based; convert by adding 1
        key = k + 1;
        if (Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].ContainsKey(key))
        {
            realKeyIndices[i] = key;
            indices[i] = i;
            values[i] = Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector][key];
        }
        else
        {
            realKeyIndices[i] = key;
            indices[i] = i;
            if (i == TrainingTesting_SharedVariables._trainTopIGFeatures.Length - 1)
            {
                // Assign a placeholder class. '2' is temporary and should be based on
                // something else or ignored entirely; since this is testing, it does
                // not matter which class it is.
                values[i] = 2;
            }
            //else
            //    values[i] = 1; // is it proper to put 0 as a missing value in a sparse array for a value which is not in the original problem?
        }
        i += 1;
    }

    //int[] indices = Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].Keys.ToArray();
    //double[] values = Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].Values.ToArray();

    // init statistical matrices (done once)
    if (Preferences.Instance.MatrixStatistics == null)
    {
        Preferences.Instance.MatrixStatistics = new StatisticsFeatures(TrainingTesting_SharedVariables._trainTopIGFeatures.Length, Preferences.Instance.events.EventListLastTr, realKeyIndices, false);
    }
    if (Preferences.Instance.MatrixStatisticsNormalized == null)
    {
        Preferences.Instance.MatrixStatisticsNormalized = new StatisticsFeatures(TrainingTesting_SharedVariables._trainTopIGFeatures.Length, Preferences.Instance.events.EventListLastTr, realKeyIndices, false);
    }
    if (Preferences.Instance.TestingBaselineStatistics == null)
    {
        Preferences.Instance.TestingBaselineStatistics = new StatisticsFeatures(TrainingTesting_SharedVariables._trainTopIGFeatures.Length, Preferences.Instance.events.eventList[0].var2, realKeyIndices, true);
    }

    // update matrices
    Preferences.Instance.MatrixStatistics.updateMatrix(Preferences.Instance.currentClassifiedVector, values);
    Preferences.Instance.TestingBaselineStatistics.updateMatrix(Preferences.Instance.currentClassifiedVector, values);

    // normalize the realtime raw values
    if (!TrainingTesting_SharedVariables._svmscaleTesting.NormalizeSingleVector(realKeyIndices, ref values))
    {
        GuiPreferences.Instance.setLog("normalization failed");
        return;
    }
    Preferences.Instance.MatrixStatisticsNormalized.updateMatrix(Preferences.Instance.currentClassifiedVector, values);

    // the class value is destroyed in the normalization stage, so we reassign
    // the fake class (not used in testing)
    values[TrainingTesting_SharedVariables._trainTopIGFeatures.Length - 1] = 2;

    var instance = new SparseInstance(1.0f, values, indices, indices.Length);

    /*instances.setClassIndex(instances.numAttributes() - 1);
     *
     * var instance = new SparseInstance(instances.numAttributes());
     * instance.setDataset(instances);
     *
     * foreach (int key in Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector].Keys)
     * {
     *     //var attribute = instances.attribute(pair.Key);
     *     //double value = pair.Value;
     *     //if (attribute.isNumeric())
     *     instance.setValue(instances.attribute(key), Preferences.Instance.ProblemOriginal.samples[Preferences.Instance.currentClassifiedVector][key]);
     *     //else
     *     //    instance.setValue(instances.attribute(pair.Key), value.ToString());
     * }*/

    instances.add(instance);
    instances.setClassIndex(TrainingTesting_SharedVariables._trainTopIGFeatures.Length - 1);
}
public void LearnModel()
{
    Init();
    foreach (Feature currFeature in DomPool.SelectorFeatures)
    {
        String featureString = currFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            if (!allNodes.Contains(nd))
            {
                continue;
            }
            nodeFeatures[nd].Add(featureString);
        }
    }

    FastVector fvWekaAttributes = GetDataSetAtts();
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 100);
    trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());
        for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            if (nodeFeatures[currNode].Contains(currFeature.name()))
            {
                item.setValue(currFeature, 1);
            }
            else
            {
                item.setValue(currFeature, 0);
            }
        }

        // set the class
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
        item.setValue(classFeature, DomPool.TargetNodes.Contains(currNode) ? "yes" : "no");
        item.setDataset(trainingSet);

        // oversample target nodes to balance the two classes
        if (DomPool.TargetNodes.Contains(currNode))
        {
            for (int t = 0; t < (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()); t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    String[] options = new String[2];
    options[0] = "-C";   // confidence threshold for pruning
    options[1] = "0.1";

    J48 tree = new J48();              // new instance of tree
    tree.setOptions(options);          // set the options
    tree.buildClassifier(trainingSet); // build classifier

    // save the resulting classifier
    classifierTree = tree;

    Reader treeDot = new StringReader(tree.graph());
    TreeBuild treeBuild = new TreeBuild();
    Node treeRoot = treeBuild.create(treeDot);
    FeaturesUsed = getTreeFeatures(treeRoot);
}
public void LearnModel()
{
    Init();
    foreach (Feature currFeature in DomPool.SelectorFeatures)
    {
        String featureString = currFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            if (!allNodes.Contains(nd))
            {
                continue;
            }
            nodeFeatures[nd].Add(featureString);
        }
    }

    FastVector fvWekaAttributes = GetDataSetAtts();
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 10);
    trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());
        for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            if (nodeFeatures[currNode].Contains(currFeature.name()))
            {
                item.setValue(currFeature, 1);
            }
            else
            {
                item.setValue(currFeature, 0);
            }
        }

        // set the class
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
        item.setValue(classFeature, DomPool.TargetNodes.Contains(currNode) ? "yes" : "no");
        item.setDataset(trainingSet);

        // oversample target nodes to balance the two classes
        if (DomPool.TargetNodes.Contains(currNode))
        {
            for (int t = 0; t < (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()); t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    //String[] options = new string[] { "-C", "0.05" };
    NaiveBayes cls = new NaiveBayes(); // new Naive Bayes classifier
    //cls.setOptions(weka.core.Utils.splitOptions("-C 1.0 -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0\""));
    cls.buildClassifier(trainingSet); // build classifier

    // save the resulting classifier
    classifier = cls;

    // unlike the J48 variant, Naive Bayes has no tree to extract features
    // from, so every selector feature is marked as used
    FeaturesUsed = new HashSet<string>();
    foreach (Feature f in DomPool.SelectorFeatures)
    {
        FeaturesUsed.Add(f.ToString());
    }
}
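Putting the pieces together: the LearnModel variants above build and store the classifier that RunOnTestSeenSet (first example) later applies to unseen DOM nodes. A minimal sketch of that flow, assuming both methods live on the same wrapper class; the wrapper's name here is hypothetical.

// "DomClassifier" is a hypothetical name for the class hosting
// LearnModel and RunOnTestSeenSet in the examples above.
var learner = new DomClassifier();
learner.LearnModel(); // builds the J48 (or Naive Bayes) model and records FeaturesUsed
HashSet<HtmlNode> selected = learner.RunOnTestSeenSet(); // nodes classified "yes"
foreach (HtmlNode node in selected)
{
    Console.WriteLine(node.XPath); // HtmlAgilityPack exposes each node's XPath
}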