public static HashSet<Feature> KeepTopK(HashSet<Feature> featureSet, int k)
{
    // The k * 200 threshold effectively disables the filtering for all but very
    // large feature sets: running one XPath query per feature costs far more in
    // performance than the pruning saves.
    if (featureSet.Count <= k * 200)
    {
        return featureSet;
    }

    // Rank each feature by the entropy of the node set its XPath selects:
    // features that split cleanly into target/non-target nodes score lower.
    LinkedList<object[]> toSort = new LinkedList<object[]>();
    foreach (Feature f in featureSet)
    {
        HashSet<HtmlNode> res = DomPool.RunXpathQuery(
            XpathTools.FeatureSetToXpath(new HashSet<Feature>(new[] { f })));
        HashSet<HtmlNode> selectedPos = new HashSet<HtmlNode>(res.Intersect(DomPool.TargetNodes));
        double positiveFraction = ((double)selectedPos.Count) / res.Count;
        double entropy = Statistics.CalculateEntropy(positiveFraction, 1 - positiveFraction);
        toSort.AddFirst(new object[] { f, entropy });
    }

    // Keep the k lowest-entropy features.
    var resTopK = toSort.OrderBy(x => (double)x[1]).Select(x => (Feature)x[0]).Take(k);
    return new HashSet<Feature>(resTopK);
}
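// ---------------------------------------------------------------------------
// Statistics.CalculateEntropy is defined elsewhere in the project; from its
// call sites it is the two-class Shannon entropy of a probability pair. Below
// is a minimal stand-in consistent with that usage, plus the same
// lowest-entropy-first ordering KeepTopK applies. The class name and the toy
// feature fractions are illustrative, not from the repository.
// ---------------------------------------------------------------------------
static class EntropyDemo
{
    // Two-class Shannon entropy; 0 * log(0) is treated as 0.
    static double CalculateEntropy(double p, double q)
    {
        double Term(double x) => x <= 0 ? 0 : -x * Math.Log(x, 2);
        return Term(p) + Term(q);
    }

    public static void Run()
    {
        // Toy (feature, fraction of selected nodes that are targets) pairs.
        var features = new[] { ("a[@class]", 0.95), ("div", 0.50), ("span[@id]", 0.10) };

        // Lower entropy = cleaner split, mirroring the ordering in KeepTopK.
        foreach (var (name, p) in features.OrderBy(f => CalculateEntropy(f.Item2, 1 - f.Item2)))
        {
            Console.WriteLine($"{name}: H = {CalculateEntropy(p, 1 - p):F3}");
        }
    }
}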
public HashSet<HtmlNode> selectTrue(HashSet<HtmlNode> nodes, HashSet<Feature> prevFeatures, Boolean right, double threshold = 1)
{
    // A node whose precision already meets the threshold accepts its whole input.
    if (this.precision >= threshold)
    {
        return nodes;
    }

    // The feature tested at this node is the one in FeatureSet beyond the
    // features already tested on the path from the root.
    HashSet<Feature> currFeature = new HashSet<Feature>(this.FeatureSet.Except(prevFeatures));
    if (currFeature.Count == 0)
    {
        // Leaf: the right (feature-present) branch accepts, the left rejects.
        return right ? nodes : new HashSet<HtmlNode>();
    }

    Feature cf = currFeature.First();
    HashSet<HtmlNode> featureRes = DomPool.RunXpathQuery(
        XpathTools.FeatureSetToXpath(new HashSet<Feature>() { cf }));
    featureRes.IntersectWith(nodes);

    // Nodes matching the feature descend the selected branch; the full input
    // is also re-examined on the not-selected branch, and the results are unioned.
    HashSet<HtmlNode> rightRes = this.SetSelected.selectTrue(featureRes, this.FeatureSet, true, threshold);
    HashSet<HtmlNode> leftRes = this.SetNotSelected.selectTrue(nodes, prevFeatures, false, threshold);
    return new HashSet<HtmlNode>(rightRes.Union(leftRes));
}
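// ---------------------------------------------------------------------------
// The recursion is easier to see on a toy model: each node either accepts its
// whole input (precision met), or splits it on one feature and unions the two
// subtree results. A self-contained sketch over integer IDs; the class and
// member names here are illustrative, not the repository's DecisionNode.
// ---------------------------------------------------------------------------
class ToyNode
{
    public double Precision;                 // precision of this node's selection
    public Func<int, bool> Feature;          // stand-in for the XPath feature test
    public ToyNode Selected, NotSelected;    // right / left subtrees

    public HashSet<int> SelectTrue(HashSet<int> nodes, bool right, double threshold)
    {
        if (Precision >= threshold) return nodes;                       // accept everything
        if (Feature == null) return right ? nodes : new HashSet<int>(); // leaf
        var matching = new HashSet<int>(nodes.Where(Feature));
        var rightRes = Selected.SelectTrue(matching, true, threshold);
        var leftRes = NotSelected.SelectTrue(nodes, false, threshold);  // full input, as in the original
        return new HashSet<int>(rightRes.Union(leftRes));
    }

    public static void Run()
    {
        // Split on "is even": even IDs reach an accepting leaf, odd IDs a rejecting one.
        var root = new ToyNode
        {
            Precision = 0.5,
            Feature = n => n % 2 == 0,
            Selected = new ToyNode { Precision = 1.0 },
            NotSelected = new ToyNode { Precision = 0.0 },
        };
        var result = root.SelectTrue(new HashSet<int> { 1, 2, 3, 4 }, true, 0.9);
        Console.WriteLine(string.Join(",", result.OrderBy(x => x))); // prints: 2,4
    }
}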
public static void ImproveTree(DecisionNode dn, int level)
{
    double maxScore = 0;
    Feature maxGainFeature = null;
    HashSet<HtmlNode> newFeatureSelected = null;
    Object lockObj = new object();

    // Balance factor: down-weights the (much larger) negative class, decaying
    // with tree depth so deeper nodes are corrected less aggressively.
    double balanceFix = Math.Max(1, Math.Pow(0.3, Math.Sqrt(level + 1))
                                    * (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()));
    double dnEntropy = dn.CalculateEntropy(1, balanceFix);

    Parallel.ForEach(DomPool.SelectorFeatures, (currCandidate) =>
    {
        HashSet<Feature> newSelectorSet = new HashSet<Feature>(dn.FeatureSet);
        newSelectorSet.Add(currCandidate);

        // Nodes of this decision node that the candidate feature selects / rejects.
        string currFeatureXpath = XpathTools.FeatureSetToXpath(new HashSet<Feature>() { currCandidate });
        HashSet<HtmlNode> currFeatureXpathSelected = DomPool.RunXpathQuery(currFeatureXpath);
        HashSet<HtmlNode> xpathCurrSelected = new HashSet<HtmlNode>(currFeatureXpathSelected.Intersect(dn.InitialNodeSet));
        HashSet<HtmlNode> xpathCurrNotSelected = new HashSet<HtmlNode>(dn.InitialNodeSet.Except(xpathCurrSelected));

        // Split this node's positives/negatives across the two candidate branches
        // to calculate the information gain.
        HashSet<HtmlNode> currSelectedPositive = new HashSet<HtmlNode>(xpathCurrSelected.Intersect(dn.SelectedPositive));
        HashSet<HtmlNode> currSelectedNegative = new HashSet<HtmlNode>(xpathCurrSelected.Intersect(dn.SelectedNegative));
        HashSet<HtmlNode> currNotSelectedPositive = new HashSet<HtmlNode>(xpathCurrNotSelected.Intersect(dn.SelectedPositive));
        HashSet<HtmlNode> currNotSelectedNegative = new HashSet<HtmlNode>(xpathCurrNotSelected.Intersect(dn.SelectedNegative));

        double sp = ((double)currSelectedPositive.Count) / xpathCurrSelected.Count;
        double sn = ((double)currSelectedNegative.Count) / xpathCurrSelected.Count;
        double relativeRecall = ((double)currSelectedPositive.Count) / ((double)dn.SelectedPositive.Count);

        // FIX: was sn = sn / (1 + Math.Pow(0, level + 1));
        sn = sn / balanceFix;

        double selectedProbability = ((double)xpathCurrSelected.Count) / dn.InitialNodeSet.Count;
        double selectedEntropy = Statistics.CalculateEntropy(sp, sn);

        double nsp = ((double)currNotSelectedPositive.Count) / xpathCurrNotSelected.Count;
        double nsn = 1 - nsp;
        // Apply the same balance fix to the not-selected branch.
        nsn = nsn / balanceFix;
        double notselectedProbability = 1 - selectedProbability;
        double notSelectedEntropy = Statistics.CalculateEntropy(nsp, nsn);

        // Re-normalize the branch probabilities under the class re-weighting.
        double balanceFixProb = balanceFix;
        double sumTemp = selectedProbability * sn + selectedProbability * sp * balanceFixProb
                       + notselectedProbability * nsn + notselectedProbability * nsp * balanceFixProb;
        selectedProbability = (selectedProbability * sn + selectedProbability * sp * balanceFixProb) / sumTemp;
        notselectedProbability = (notselectedProbability * nsn + notselectedProbability * nsp * balanceFixProb) / sumTemp;

        // Information gain of splitting this node on the candidate feature.
        double gain = dnEntropy - ((selectedProbability * selectedEntropy) + (notselectedProbability * notSelectedEntropy));

        // Unused below; 3.0 ensures floating-point division (the original "3 / Count()"
        // was integer division and always evaluated to 0 inside this branch).
        double phaseOfDecrease = 1;
        if (DomPool.trainingDocsNames.Count() > 3)
        {
            phaseOfDecrease = 3.0 / DomPool.trainingDocsNames.Count();
        }

        // Choose the most cost-effective feature: discount the gain by the feature's
        // cost, its recall loss, and how rarely it appears across the training docs.
        gain = gain / (currCandidate.cost
                       + ((1 - relativeRecall)
                          + (1 - ((double)DomPool.FeatureFrequencey[currCandidate.feature.First().ToLower()]) / DomPool.trainingDocsNames.Count))
                         * Math.Pow(0.3, level));

        lock (lockObj)
        {
            if (gain > maxScore && sp > nsp)
            {
                maxScore = gain;
                maxGainFeature = currCandidate;
                newFeatureSelected = xpathCurrSelected;
            }
        }
    });

    if (maxGainFeature == null)
    {
        return;
    }

    // Grow the selected (feature-present) child.
    dn.SetSelected = new DecisionNode();
    dn.SetSelected.InitialNodeSet = newFeatureSelected;
    dn.SetSelected.FeatureSet = new HashSet<Feature>(dn.FeatureSet);
    dn.SetSelected.FeatureSet.Add(maxGainFeature);
    dn.SetSelected.SelectedNegative = new HashSet<HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
    dn.SetSelected.SelectedPositive = new HashSet<HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
    dn.SetSelected.CalculateEntropy();

    // Grow the not-selected child.
    dn.SetNotSelected = new DecisionNode();
    dn.SetNotSelected.InitialNodeSet = new HashSet<HtmlNode>(dn.InitialNodeSet.Except(newFeatureSelected));
    // Fix for the NOT branch: instead of relying on the negation, also pass down
    // the negatives that the selected branch picked up.
    if (FixEnabledForNotBranch)
    {
        dn.SetNotSelected.InitialNodeSet.UnionWith(dn.SetSelected.SelectedNegative);
    }
    dn.SetNotSelected.FeatureSet = new HashSet<Feature>(dn.FeatureSet);
    dn.SetNotSelected.SelectedNegative = new HashSet<HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
    dn.SetNotSelected.SelectedPositive = new HashSet<HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
    dn.SetNotSelected.CalculateEntropy();

    dn.FeatureSet.Add(maxGainFeature);
}
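// ---------------------------------------------------------------------------
// Before the class re-weighting and the cost discount, the gain computed above
// is ordinary information gain over the candidate split. A worked example with
// toy counts (all numbers illustrative):
// ---------------------------------------------------------------------------
static class GainDemo
{
    static double H(double p, double q)
    {
        double Term(double x) => x <= 0 ? 0 : -x * Math.Log(x, 2);
        return Term(p) + Term(q);
    }

    public static void Run()
    {
        // Parent node: 40 positives, 60 negatives. The candidate feature
        // selects 30 positives + 10 negatives; the rest go to the other branch.
        double parent = H(0.4, 0.6);                  // ≈ 0.971
        double selected = H(30.0 / 40, 10.0 / 40);    // ≈ 0.811, weight 40/100
        double notSelected = H(10.0 / 60, 50.0 / 60); // ≈ 0.650, weight 60/100
        double gain = parent - (0.4 * selected + 0.6 * notSelected);
        Console.WriteLine($"gain = {gain:F3}");       // ≈ 0.257
    }
}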
static void Main(string[] args)
{
    Console.WriteLine("T for test, R for run, S for seen overall testing, A to parse a file, ARCHIVE for archive testing, HUGE to test all sites; anything else runs overall testing:");
    string res = ReadLine();
    string cmd = res.ToLower().Trim();

    if (cmd.Equals("huge"))
    {
        TestSites.TestAllSites();
        return;
    }

    if (cmd.Equals("t"))
    {
        // Interactive mode: run ad-hoc XPath queries against the loaded documents
        // and report per-document and overall precision/recall.
        DomPool.LoadDocuments(FILES_LOCATION);
        DomPool.Initiate();
        Console.WriteLine("insert query:");
        string q = ReadLine();
        while (!q.Equals("exit"))
        {
            var runres = DomPool.RunXpathQuery(q);
            if (runres != null)
            {
                Console.WriteLine("result size: " + runres.Count());
                HashSet<HtmlNode> spos = new HashSet<HtmlNode>(DomPool.TargetNodes.Intersect(runres));
                HashSet<HtmlNode> sposprecision = new HashSet<HtmlNode>(DomPool.TargetNodesPrecision.Intersect(runres));
                foreach (var entry in DomPool.docsAndNames)
                {
                    HashSet<HtmlNode> docNodes = new HashSet<HtmlNode>(entry.Value.SelectNodes("//*"));
                    HashSet<HtmlNode> currspos = new HashSet<HtmlNode>(spos.Intersect(docNodes));
                    HashSet<HtmlNode> currrunres = new HashSet<HtmlNode>(runres.Intersect(docNodes));
                    HashSet<HtmlNode> currsposprecision = new HashSet<HtmlNode>(sposprecision.Intersect(docNodes));
                    HashSet<HtmlNode> currTargetNodes = new HashSet<HtmlNode>(DomPool.TargetNodes.Intersect(docNodes));
                    Console.WriteLine(entry.Key
                        + "-Precision:" + (currsposprecision.Count() / ((double)currrunres.Count()))
                        + ". Recall:" + (currspos.Count() / ((double)currTargetNodes.Count())));
                }
                Console.WriteLine("Precision:" + (sposprecision.Count() / ((double)runres.Count()))
                    + ". Recall:" + (spos.Count() / ((double)DomPool.TargetNodes.Count())));
            }
            else
            {
                Console.WriteLine("null");
            }
            Console.WriteLine("insert query:");
            q = ReadLine();
        }
    }
    else if (cmd.Equals("r"))
    {
        Console.WriteLine(LearnXpathWrapper.LearnXpathFromTrainingFiles(FILES_LOCATION));
        Console.ReadLine();
    }
    else if (cmd.Equals("s"))
    {
        Console.WriteLine("Output is redirected to resultsSeen.txt in the debug dir");
        // Write results to a text file instead of the console window.
        FileStream fs = new FileStream("resultsSeen.txt", FileMode.Create);
        StreamWriter sw = new StreamWriter(fs);
        TextWriter tmp = Console.Out;
        Console.SetOut(sw);
        OverallSeenTesting.RunTest(FILES_LOCATION);
        Console.SetOut(tmp);
        sw.Flush();
        sw.Close();
    }
    else if (cmd.Equals("archive"))
    {
        Console.WriteLine("Output is redirected to archive2-results.txt in the debug dir");
        // Write results to a text file instead of the console window.
        FileStream fs = new FileStream("archive2-results.txt", FileMode.Create);
        StreamWriter sw = new StreamWriter(fs);
        TextWriter tmp = Console.Out;
        Console.SetOut(sw);
        OverallArchive2Testing.RunTest(ARCHIVE_FILES_LOCATION);
        Console.SetOut(tmp);
        sw.Flush();
        sw.Close();
    }
    else if (cmd.Equals("a"))
    {
        Console.WriteLine("Please enter file name to parse:");
        string fnp = ReadLine().Trim();
        parseres.learn(fnp);
        parseres.save("parsed" + fnp);
    }
    else
    {
        Console.WriteLine("Output is redirected to results.txt in the debug dir");
        // Write results to a text file instead of the console window.
        FileStream fs = new FileStream("results.txt", FileMode.Create);
        StreamWriter sw = new StreamWriter(fs);
        TextWriter tmp = Console.Out;
        Console.SetOut(sw);
        OverallTesting.RunTest(FILES_LOCATION);
        Console.SetOut(tmp);
        sw.Flush();
        sw.Close();
    }
}
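// ---------------------------------------------------------------------------
// The redirect-run-restore pattern above is repeated three times. A hedged
// sketch of the same pattern as a reusable helper (the name RunWithOutputFile
// is illustrative, not from the repository); using/finally guarantees the
// original writer is restored and the file is flushed even if the test throws:
// ---------------------------------------------------------------------------
static class OutputRedirect
{
    // Runs an action with Console.Out redirected to the given file,
    // restoring the original writer afterwards.
    static void RunWithOutputFile(string path, Action action)
    {
        TextWriter original = Console.Out;
        using (var sw = new StreamWriter(path))
        {
            Console.SetOut(sw);
            try { action(); }
            finally { Console.SetOut(original); }
        }
    }

    public static void Run()
    {
        RunWithOutputFile("results.txt", () => Console.WriteLine("written to file"));
        Console.WriteLine("back on the console");
    }
}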
public void LearnModel()
{
    Init();

    // Mark, for every node, which selector features match it.
    foreach (Feature currFeature in DomPool.SelectorFeatures)
    {
        String featureString = currFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            if (!allNodes.Contains(nd))
            {
                continue;
            }
            nodeFeatures[nd].Add(featureString);
        }
    }

    // Build the Weka training set: one binary attribute per feature, plus a
    // yes/no class attribute in the last position.
    FastVector fvWekaAttributes = GetDataSetAtts();
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 100);
    trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());
        for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            item.setValue(currFeature, nodeFeatures[currNode].Contains(currFeature.name()) ? 1 : 0);
        }

        // Set the class attribute.
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
        item.setValue(classFeature, DomPool.TargetNodes.Contains(currNode) ? "yes" : "no");
        item.setDataset(trainingSet);

        // Oversample the positives to balance the classes: add one copy per
        // unit of the negative-to-positive ratio.
        if (DomPool.TargetNodes.Contains(currNode))
        {
            for (int t = 0; t < (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()); t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    // -C sets J48's pruning confidence factor (lower values prune more
    // aggressively); an unpruned tree would use -U instead.
    String[] options = new String[] { "-C", "0.1" };
    J48 tree = new J48();
    tree.setOptions(options);
    tree.buildClassifier(trainingSet);

    // Save the resulting classifier and extract the features the tree actually uses.
    classifierTree = tree;
    Reader treeDot = new StringReader(tree.graph());
    TreeBuild treeBuild = new TreeBuild();
    Node treeRoot = treeBuild.create(treeDot);
    FeaturesUsed = getTreeFeatures(treeRoot);
}
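// ---------------------------------------------------------------------------
// Once built, the J48 model classifies a feature vector for an unseen node in
// the standard Weka way. A hedged sketch through the same IKVM-compiled Weka
// bindings used above; IsTarget is an illustrative helper, and `dataset` is
// assumed to share the attributes produced by GetDataSetAtts().
// ---------------------------------------------------------------------------
static bool IsTarget(weka.classifiers.Classifier cls, weka.core.Instances dataset,
                     System.Collections.Generic.ICollection<string> nodeFeatureNames)
{
    weka.core.Instance item = new weka.core.SparseInstance(dataset.numAttributes());
    for (int i = 0; i < dataset.numAttributes() - 1; i++)
    {
        weka.core.Attribute att = dataset.attribute(i);
        // 1 if the node matches this selector feature, else 0, as in LearnModel.
        item.setValue(att, nodeFeatureNames.Contains(att.name()) ? 1 : 0);
    }
    item.setDataset(dataset);

    // classifyInstance returns an index into the class attribute's values.
    double classIndex = cls.classifyInstance(item);
    return dataset.classAttribute().value((int)classIndex).Equals("yes");
}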
public void LearnModel()
{
    Init();

    // Mark, for every node, which selector features match it.
    foreach (Feature currFeature in DomPool.SelectorFeatures)
    {
        String featureString = currFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            if (!allNodes.Contains(nd))
            {
                continue;
            }
            nodeFeatures[nd].Add(featureString);
        }
    }

    // Build the Weka training set, as in the tree-based learner.
    FastVector fvWekaAttributes = GetDataSetAtts();
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 10);
    trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());
        for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            item.setValue(currFeature, nodeFeatures[currNode].Contains(currFeature.name()) ? 1 : 0);
        }

        // Set the class attribute.
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
        item.setValue(classFeature, DomPool.TargetNodes.Contains(currNode) ? "yes" : "no");
        item.setDataset(trainingSet);

        // Oversample the positives to balance the classes.
        if (DomPool.TargetNodes.Contains(currNode))
        {
            for (int t = 0; t < (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()); t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    // NaiveBayes is used with default options; no tree structure to parse.
    NaiveBayes cls = new NaiveBayes();
    cls.buildClassifier(trainingSet);

    // Save the resulting classifier.
    classifier = cls;

    // Unlike the tree learner, Naive Bayes weighs every feature, so all
    // selector features are reported as used.
    FeaturesUsed = new HashSet<string>();
    foreach (Feature f in DomPool.SelectorFeatures)
    {
        FeaturesUsed.Add(f.ToString());
    }
}
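// ---------------------------------------------------------------------------
// Either learner can be sanity-checked with Weka's standard 10-fold
// cross-validation through the same bindings. A hedged sketch (the helper
// name is illustrative; assumes a trainingSet built as above):
// ---------------------------------------------------------------------------
static void CrossValidate(weka.core.Instances trainingSet)
{
    weka.classifiers.Evaluation eval = new weka.classifiers.Evaluation(trainingSet);
    // 10 folds, fixed seed for reproducibility.
    eval.crossValidateModel(new NaiveBayes(), trainingSet, 10, new java.util.Random(1));
    System.Console.WriteLine(eval.toSummaryString());

    // Per-class precision/recall for the positive ("yes") class.
    int yes = trainingSet.classAttribute().indexOfValue("yes");
    System.Console.WriteLine("precision(yes) = " + eval.precision(yes));
    System.Console.WriteLine("recall(yes)    = " + eval.recall(yes));
}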