/// <summary>
/// Reduces a feature set to at most <paramref name="k"/> features, keeping the ones
/// whose individual XPath selection has the lowest entropy with respect to
/// <c>DomPool.TargetNodes</c>.
/// </summary>
/// <param name="featureSet">Candidate features. Returned as-is (same instance) when it
/// holds no more than <c>k * 200</c> features.</param>
/// <param name="k">Maximum number of features to keep when filtering is applied.</param>
/// <returns>
/// Either <paramref name="featureSet"/> itself (filtering skipped) or a new set with
/// the <paramref name="k"/> lowest-entropy features.
/// </returns>
public static HashSet<Feature> KeepTopK(HashSet<Feature> featureSet, int k)
{
    // This check is here to disable the filtering for all but very large sets:
    // it costs in performance much more than it saves.
    if (featureSet.Count <= k * 200)
    {
        return featureSet;
    }

    List<KeyValuePair<Feature, double>> scored = new List<KeyValuePair<Feature, double>>(featureSet.Count);
    foreach (Feature f in featureSet)
    {
        // Evaluate the feature on its own: which nodes does it select, and how many are targets?
        HashSet<HtmlNode> matched = DomPool.RunXpathQuery(
            XpathTools.FeatureSetToXpath(new HashSet<Feature>(new Feature[] { f })));

        double entropy;
        if (matched.Count == 0)
        {
            // A feature that selects nothing has no defined positive ratio (0/0 is NaN,
            // and NaN sorts before every other double). Rank it last instead of first.
            entropy = double.PositiveInfinity;
        }
        else
        {
            int positives = matched.Intersect(DomPool.TargetNodes).Count();
            double positiveRatio = (double)positives / matched.Count;
            entropy = Statistics.CalculateEntropy(positiveRatio, 1 - positiveRatio);
        }

        scored.Add(new KeyValuePair<Feature, double>(f, entropy));
    }

    // OrderBy is stable; reversing first keeps ties resolved in favor of features that
    // come later in the set's enumeration order.
    scored.Reverse();

    IEnumerable<Feature> topK = scored
        .OrderBy(pair => pair.Value)
        .Select(pair => pair.Key)
        .Take(k);

    return new HashSet<Feature>(topK);
}
/// <summary>
/// Recursively evaluates this decision node and its children against
/// <paramref name="nodes"/> and returns the union of what both branches select.
/// </summary>
/// <param name="nodes">Nodes under consideration at this point of the tree.</param>
/// <param name="prevFeatures">Features already applied on the path to this node; the
/// feature this node splits on is taken from <c>FeatureSet</c> minus this set.</param>
/// <param name="right">Whether this node was reached through the selected branch.
/// When no new feature is left, a selected-branch node returns <paramref name="nodes"/>
/// and a not-selected-branch node returns an empty set.</param>
/// <param name="threshold">If this node's precision is at least this value,
/// <paramref name="nodes"/> is returned without descending further.</param>
/// <returns>The set of nodes selected by the subtree rooted at this node.</returns>
public HashSet<HtmlNode> selectTrue(HashSet<HtmlNode> nodes, HashSet<Feature> prevFeatures, Boolean right, double threshold = 1)
{
    if (this.precision >= threshold)
    {
        return nodes;
    }

    HashSet<Feature> newFeatures = new HashSet<Feature>(this.FeatureSet.Except(prevFeatures));
    if (newFeatures.Count == 0)
    {
        return right ? nodes : new HashSet<HtmlNode>();
    }

    Feature splitFeature = newFeatures.First();
    HashSet<HtmlNode> queryResult = DomPool.RunXpathQuery(
        XpathTools.FeatureSetToXpath(new HashSet<Feature>() { splitFeature }));

    // Intersect into a fresh set rather than mutating the query result in place: the
    // other callers of RunXpathQuery treat its return value as read-only, and it may
    // be shared by the pool.
    HashSet<HtmlNode> matching = new HashSet<HtmlNode>(queryResult.Intersect(nodes));

    // The selected branch continues with the matching nodes and this node's features;
    // the not-selected branch continues with the incoming nodes and features.
    HashSet<HtmlNode> rightRes = this.SetSelected.selectTrue(matching, this.FeatureSet, true, threshold);
    HashSet<HtmlNode> leftRes = this.SetNotSelected.selectTrue(nodes, prevFeatures, false, threshold);

    return new HashSet<HtmlNode>(rightRes.Union(leftRes));
}
/// <summary>
/// Returns the textual form of this feature, computing it on first use and
/// reusing the stored value afterwards.
/// </summary>
/// <returns>
/// The XPath produced by <c>XpathTools.FeatureSetToXpath</c> when <c>feature</c> has
/// exactly one element; otherwise the elements of <c>feature</c> joined with "&gt;".
/// </returns>
public override string ToString()
{
    // Already computed: hand back the stored representation.
    if (_stringRepresentation != null)
    {
        return _stringRepresentation;
    }

    bool hasSingleElement = feature.Count() == 1;
    _stringRepresentation = hasSingleElement
        ? XpathTools.FeatureSetToXpath((new List<Feature>() { this }))
        : String.Join(">", feature);

    return _stringRepresentation;
}
/// <summary>
/// Tries to split <paramref name="dn"/> on the selector feature from
/// <c>DomPool.SelectorFeatures</c> with the highest cost-adjusted information gain.
/// </summary>
/// <remarks>
/// When a feature is chosen, <c>dn.SetSelected</c> and <c>dn.SetNotSelected</c> are
/// created and the feature is added to <c>dn.FeatureSet</c>. When no candidate yields a
/// positive gain with a higher positive ratio on its selected side, or when there are
/// no target nodes at all, no children are assigned.
/// </remarks>
/// <param name="dn">The decision node to split.</param>
/// <param name="level">Depth used to damp the class-balance correction
/// (<c>0.3^sqrt(level + 1)</c>) and the recall/frequency penalty (<c>0.3^level</c>).</param>
public static void ImproveTree(DecisionNode dn, int level)
{
    // Without any target node the negative/positive ratio is undefined and no candidate
    // can satisfy the acceptance test below, so there is nothing to split on.
    int targetCount = DomPool.TargetNodes.Count();
    if (targetCount == 0)
    {
        return;
    }

    double maxScore = 0;
    Feature maxGainFeature = null;
    HashSet<HtmlNode> newFeatureSelected = null;
    Object lockObj = new object();

    // Cast to double so the class ratio is not truncated by integer division.
    double negativesPerPositive = (double)DomPool.NonTargetNodes.Count() / targetCount;
    double balanceFix = Math.Max(1, Math.Pow(0.3, Math.Sqrt(level + 1)) * negativesPerPositive);
    double dnEntropy = dn.CalculateEntropy(1, balanceFix);

    Parallel.ForEach(DomPool.SelectorFeatures, (currCandidate) =>
    {
        string candidateXpath = XpathTools.FeatureSetToXpath(new HashSet<Feature>() { currCandidate });
        HashSet<HtmlNode> candidateMatches = DomPool.RunXpathQuery(candidateXpath);

        // Partition this node's set by whether the candidate selects each node.
        HashSet<HtmlNode> selected = new HashSet<HtmlNode>(dn.InitialNodeSet.Intersect(candidateMatches));
        HashSet<HtmlNode> notSelected = new HashSet<HtmlNode>(dn.InitialNodeSet.Except(selected));

        // Calculate information gain.
        int selectedPositives = selected.Intersect(dn.SelectedPositive).Count();
        int selectedNegatives = selected.Intersect(dn.SelectedNegative).Count();
        int notSelectedPositives = notSelected.Intersect(dn.SelectedPositive).Count();

        double sp = ((double)selectedPositives) / selected.Count;
        double sn = ((double)selectedNegatives) / selected.Count;
        double relativeRecall = ((double)selectedPositives) / ((double)dn.SelectedPositive.Count());

        // Scale the negative share down by the balance correction.
        sn = sn / balanceFix;
        double selectedProbability = ((double)selected.Count) / dn.InitialNodeSet.Count();
        double selectedEntropy = Statistics.CalculateEntropy(sp, sn);

        double nsp = ((double)notSelectedPositives) / notSelected.Count;
        double nsn = 1 - nsp;
        nsn = nsn / balanceFix;
        double notSelectedProbability = 1 - selectedProbability;
        double notSelectedEntropy = Statistics.CalculateEntropy(nsp, nsn);

        // Re-weight the two branch probabilities with the same balance correction and
        // normalize them so they sum to one.
        double sumTemp = (selectedProbability * sn + selectedProbability * sp * balanceFix + notSelectedProbability * nsn + notSelectedProbability * nsp * balanceFix);
        double selectedWeight = (selectedProbability * sn + selectedProbability * sp * balanceFix) / sumTemp;
        double notSelectedWeight = (notSelectedProbability * nsn + notSelectedProbability * nsp * balanceFix) / sumTemp;

        double gain = dnEntropy - ((selectedWeight * selectedEntropy) + (notSelectedWeight * notSelectedEntropy));

        // Choose the most cost effective feature: divide the gain by the feature cost plus
        // a depth-damped penalty for low recall and for features seen in few training documents.
        double documentFrequency = ((double)DomPool.FeatureFrequencey[currCandidate.feature.First().ToLower()]) / DomPool.trainingDocsNames.Count;
        gain = gain / (currCandidate.cost + ((1 - relativeRecall) + (1 - documentFrequency)) * Math.Pow(0.3, level));

        lock (lockObj)
        {
            // Comparisons with NaN are false, so candidates with an empty side
            // (0/0 ratios) are never accepted here.
            if (gain > maxScore && sp > nsp)
            {
                maxScore = gain;
                maxGainFeature = currCandidate;
                newFeatureSelected = selected;
            }
        }
    });

    if (maxGainFeature == null)
    {
        return;
    }

    // Selected branch: the nodes the chosen feature matches, with the feature added.
    dn.SetSelected = new DecisionNode();
    dn.SetSelected.InitialNodeSet = newFeatureSelected;
    dn.SetSelected.FeatureSet = new HashSet<Feature>(dn.FeatureSet);
    dn.SetSelected.FeatureSet.Add(maxGainFeature);
    dn.SetSelected.SelectedNegative = new HashSet<HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
    dn.SetSelected.SelectedPositive = new HashSet<HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
    dn.SetSelected.CalculateEntropy();

    // Not-selected branch: the remaining nodes, with the parent's features unchanged.
    dn.SetNotSelected = new DecisionNode();
    dn.SetNotSelected.InitialNodeSet = new HashSet<HtmlNode>(dn.InitialNodeSet.Except(newFeatureSelected));

    // Fix for the not-branch (instead of having the NOT): the negatives captured by the
    // selected branch are added back to the not-selected branch's node set.
    if (FixEnabledForNotBranch)
    {
        dn.SetNotSelected.InitialNodeSet.UnionWith(dn.SetSelected.SelectedNegative);
    }

    dn.SetNotSelected.FeatureSet = new HashSet<Feature>(dn.FeatureSet);
    dn.SetNotSelected.SelectedNegative = new HashSet<HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
    dn.SetNotSelected.SelectedPositive = new HashSet<HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
    dn.SetNotSelected.CalculateEntropy();

    // Record the split feature on the parent only after both children copied the old set.
    dn.FeatureSet.Add(maxGainFeature);
}