/// <summary>
/// Reduces <paramref name="featureSet"/> to the <paramref name="k"/> features whose
/// single-feature XPath query yields the lowest entropy with respect to the target nodes.
/// </summary>
/// <param name="featureSet">Candidate features; returned unchanged (same instance) when it holds at most <c>k * 200</c> entries.</param>
/// <param name="k">Number of features to keep.</param>
/// <returns>The original set, or a new set with the <paramref name="k"/> lowest-entropy features.</returns>
public static HashSet <Feature> KeepTopK(HashSet <Feature> featureSet, int k)
        {
            // The generous threshold effectively disables the filtering:
            // it costs in performance much more than it saves.
            if (featureSet.Count() <= k * 200)
            {
                return featureSet;
            }

            // Entries are prepended, so equal-entropy features keep the reverse of the
            // set's enumeration order after the (stable) sort below.
            LinkedList<KeyValuePair<Feature, double>> scored = new LinkedList<KeyValuePair<Feature, double>>();

            foreach (Feature candidate in featureSet)
            {
                HashSet<Feature> single = new HashSet<Feature>();
                single.Add(candidate);

                HashSet<HtmlNode> matched        = DomPool.RunXpathQuery(XpathTools.FeatureSetToXpath(single));
                int               matchedTargets = matched.Intersect(DomPool.TargetNodes).Count();

                // Share of target nodes among the matches. A query that matches nothing
                // gives 0/0 = NaN, which is handed to CalculateEntropy as is.
                double targetShare = (double)matchedTargets / matched.Count;
                double entropy     = Statistics.CalculateEntropy(targetShare, 1 - targetShare);

                scored.AddFirst(new KeyValuePair<Feature, double>(candidate, entropy));
            }

            IEnumerable<Feature> lowestEntropy = scored.OrderBy(pair => pair.Value)
                                                       .Take(k)
                                                       .Select(pair => pair.Key);

            return new HashSet<Feature>(lowestEntropy);
        }
        /// <summary>
        /// Recursively collects the members of <paramref name="nodes"/> that this decision
        /// (sub)tree accepts.
        /// </summary>
        /// <param name="nodes">Nodes that reached this tree node. Never modified, but may be returned as is.</param>
        /// <param name="prevFeatures">Features already accounted for by the caller; the feature this node splits on is whatever <c>FeatureSet</c> holds beyond these.</param>
        /// <param name="right">True when this node was reached through its parent's selected branch.</param>
        /// <param name="threshold">Precision at or above which every incoming node is accepted without descending further.</param>
        /// <returns>The accepted nodes.</returns>
        public HashSet <HtmlNode> selectTrue(HashSet <HtmlNode> nodes, HashSet <Feature> prevFeatures, Boolean right, double threshold = 1)
        {
            // Precise enough: accept everything that got here.
            if (this.precision >= threshold)
            {
                return nodes;
            }

            HashSet<Feature> currFeature = new HashSet<Feature>(this.FeatureSet.Except(prevFeatures));

            // No feature of its own: a selected-branch node accepts its input,
            // a not-selected-branch node accepts nothing.
            if (currFeature.Count == 0)
            {
                return right ? nodes : new HashSet<HtmlNode>();
            }

            Feature           cf       = currFeature.First();
            HashSet<HtmlNode> queryRes = DomPool.RunXpathQuery(XpathTools.FeatureSetToXpath(new HashSet<Feature>()
            {
                cf
            }));

            // Intersect on a private copy. The other users of RunXpathQuery in this code base
            // never mutate its result, and it may be shared or cached by DomPool
            // (NOTE(review): not visible from here - confirm), so do not narrow it in place.
            HashSet<HtmlNode> featureRes = new HashSet<HtmlNode>(queryRes, queryRes.Comparer);
            featureRes.IntersectWith(nodes);

            // Selected branch sees only the nodes matching the feature; the not-selected
            // branch is evaluated against the full incoming set and the caller's features.
            HashSet<HtmlNode> rightRes = this.SetSelected.selectTrue(featureRes, this.FeatureSet, true, threshold);
            HashSet<HtmlNode> leftRes  = this.SetNotSelected.selectTrue(nodes, prevFeatures, false, threshold);

            return new HashSet<HtmlNode>(rightRes.Union(leftRes));
        }
Пример #3
0
        /// <summary>
        /// Tries to split <paramref name="dn"/> on the single selector feature with the best
        /// cost-adjusted information gain. On success it attaches <c>SetSelected</c> and
        /// <c>SetNotSelected</c> children to <paramref name="dn"/> and adds the chosen feature
        /// to <c>dn.FeatureSet</c>; when no candidate qualifies the node is left untouched.
        /// </summary>
        /// <param name="dn">Decision node to split.</param>
        /// <param name="level">Depth of <paramref name="dn"/>; used to damp the balance fix and the cost penalty.</param>
        public static void ImproveTree(DecisionNode dn, int level)
        {
            double             maxScore           = 0;
            Feature            maxGainFeature     = null;
            HashSet <HtmlNode> newFeatureSelected = null;
            Object             lockObj            = new object();

            // NOTE(review): if both Count() calls return int (as LINQ's does), the ratio below is an
            // integer division and is truncated before being scaled - confirm that this is intended.
            double balanceFix = Math.Max(1, (Math.Pow(0.3, Math.Sqrt(level + 1))) * (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()));
            double dnEntropy  = dn.CalculateEntropy(1, balanceFix);

            // Candidates are scored in parallel; only the "best so far" bookkeeping is serialized.
            // Because the comparison is strict, which of several equal-scoring candidates wins
            // depends on the order in which the workers reach the lock.
            Parallel.ForEach(DomPool.SelectorFeatures, (currCandidate) =>
            {
                string currFeatureXpath = XpathTools.FeatureSetToXpath(new HashSet <Feature>()
                {
                    currCandidate
                });
                HashSet <HtmlNode> currFeatureXpathSelected = DomPool.RunXpathQuery(currFeatureXpath);

                // Partition the node's own set into "matched by the candidate" / "not matched".
                HashSet <HtmlNode> xpathCurrSelected    = new HashSet <HtmlNode>(dn.InitialNodeSet.Intersect(currFeatureXpathSelected));
                HashSet <HtmlNode> xpathCurrNotSelected = new HashSet <HtmlNode>(dn.InitialNodeSet.Except(xpathCurrSelected));

                //calculate information gain (only the sizes of the sub-partitions are needed)
                int currSelectedPositiveCount    = xpathCurrSelected.Intersect(dn.SelectedPositive).Count();
                int currSelectedNegativeCount    = xpathCurrSelected.Intersect(dn.SelectedNegative).Count();
                int currNotSelectedPositiveCount = xpathCurrNotSelected.Intersect(dn.SelectedPositive).Count();

                // An empty partition makes these 0/0 = NaN; NaN fails the comparisons in the
                // lock below, so such a candidate is never chosen.
                double sp = ((double)currSelectedPositiveCount) / xpathCurrSelected.Count;
                double sn = ((double)currSelectedNegativeCount) / xpathCurrSelected.Count;

                double relativeRecall = ((double)currSelectedPositiveCount) / ((double)dn.SelectedPositive.Count());
                //FIX: Math.Pow(0, level + 1) is 0 for level >= 0, so this first division leaves sn unchanged there
                sn = sn / (1 + Math.Pow(0, level + 1));
                sn = sn / balanceFix;
                double selectedProbability = ((double)xpathCurrSelected.Count) / dn.InitialNodeSet.Count();

                double selectedEntropy = Statistics.CalculateEntropy(sp, sn);

                double nsp = ((double)currNotSelectedPositiveCount) / xpathCurrNotSelected.Count;
                double nsn = 1 - nsp;
                // Apply Fix
                nsn = nsn / balanceFix;

                double notselectedProbability = 1 - selectedProbability;
                double notSelectedEntropy     = Statistics.CalculateEntropy(nsp, nsn);

                // Re-weight both branch probabilities with the balance fix and renormalize them.
                double sumTemp         = (selectedProbability * sn + selectedProbability * sp * balanceFix + notselectedProbability * nsn + notselectedProbability * nsp * balanceFix);
                selectedProbability    = (selectedProbability * sn + selectedProbability * sp * balanceFix) / sumTemp;
                notselectedProbability = (notselectedProbability * nsn + notselectedProbability * nsp * balanceFix) / sumTemp;
                double gain            = dnEntropy - ((selectedProbability * selectedEntropy) + (notselectedProbability * notSelectedEntropy));

                //Choose the most cost effective feature: the gain is divided by the feature cost plus a
                //penalty for lost recall and for a low FeatureFrequencey share, damped by 0.3^level
                gain = gain / (currCandidate.cost + (((1 - relativeRecall) + (1 - ((double)DomPool.FeatureFrequencey[currCandidate.feature.First().ToLower()]) / DomPool.trainingDocsNames.Count))) * Math.Pow(0.3, level));

                lock (lockObj)
                {
                    // Keep the candidate only if it beats the best score so far and its selected
                    // side is richer in positives than its not-selected side.
                    if (gain > maxScore && sp > nsp)
                    {
                        maxScore           = gain;
                        maxGainFeature     = currCandidate;
                        newFeatureSelected = xpathCurrSelected;
                    }
                }
            });

            if (maxGainFeature == null)
            {
                return;
            }

            // Child for the nodes matched by the chosen feature.
            dn.SetSelected = new DecisionNode();
            dn.SetSelected.InitialNodeSet = newFeatureSelected;
            dn.SetSelected.FeatureSet     = new HashSet <Feature>(dn.FeatureSet);
            dn.SetSelected.FeatureSet.Add(maxGainFeature);
            dn.SetSelected.SelectedNegative = new HashSet <HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
            dn.SetSelected.SelectedPositive = new HashSet <HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
            dn.SetSelected.CalculateEntropy();

            // Child for the remaining nodes.
            dn.SetNotSelected = new DecisionNode();
            dn.SetNotSelected.InitialNodeSet = new HashSet <HtmlNode>(dn.InitialNodeSet.Except(newFeatureSelected));

            //FIX FOR NOT BRANCH, INSTEAD OF HAVING THE NOT:
            //when enabled, the negatives captured by the selected child are also handed to the not-selected child.
            if (FixEnabledForNotBranch)
            {
                dn.SetNotSelected.InitialNodeSet.UnionWith(dn.SetSelected.SelectedNegative);
            }

            dn.SetNotSelected.FeatureSet       = new HashSet <Feature>(dn.FeatureSet);
            dn.SetNotSelected.SelectedNegative = new HashSet <HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
            dn.SetNotSelected.SelectedPositive = new HashSet <HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
            dn.SetNotSelected.CalculateEntropy();

            // Added last, so both children's FeatureSet copies above were taken without it.
            dn.FeatureSet.Add(maxGainFeature);
        }
Пример #4
0
        /// <summary>
        /// Console entry point. Reads one command (case-insensitive, surrounding whitespace ignored)
        /// and dispatches on it:
        /// "huge" tests all sites, "t" starts the interactive query loop, "r" learns an XPath
        /// from the training files, "s" runs the seen-overall test, "archive" runs the archive
        /// test, "a" parses a results file, and anything else (including "o") runs the overall test.
        /// The printed prompt mentions only T, R, S and O.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("T for test, R for Run, S for seen overall testing and O for overall testing:");
            string res     = ReadLine();
            string command = res.ToLower().Trim();

            switch (command)
            {
                case "huge":
                    TestSites.TestAllSites();
                    break;

                case "t":
                    RunInteractiveQueryLoop();
                    break;

                case "r":
                    Console.WriteLine(LearnXpathWrapper.LearnXpathFromTrainingFiles(FILES_LOCATION));
                    Console.ReadLine();
                    break;

                case "s":
                    RunWithConsoleRedirectedToFile("resultsSeen.txt", () => OverallSeenTesting.RunTest(FILES_LOCATION));
                    break;

                case "archive":
                    RunWithConsoleRedirectedToFile("archive2-results.txt", () => OverallArchive2Testing.RunTest(ARCHIVE_FILES_LOCATION));
                    break;

                case "a":
                {
                    Console.WriteLine("Please enter file name to parse:");
                    string fnp = ReadLine().Trim();
                    parseres.learn(fnp);
                    parseres.save("parsed" + fnp);
                    break;
                }

                default:
                    RunWithConsoleRedirectedToFile("results.txt", () => OverallTesting.RunTest(FILES_LOCATION));
                    break;
            }
        }

        // Loads the training documents, then repeatedly reads a query, runs it through
        // DomPool.RunXpathQuery and prints per-document and overall accuracy/recall until "exit" is entered.
        private static void RunInteractiveQueryLoop()
        {
            DomPool.LoadDocuments(FILES_LOCATION);
            DomPool.Initiate();

            Console.WriteLine("insert query:");
            string q = ReadLine();
            while (!q.Equals("exit"))
            {
                var runres = DomPool.RunXpathQuery(q);
                if (runres != null)
                {
                    Console.WriteLine("result size" + runres.Count());
                    HashSet <HtmlNode> spos          = new HashSet <HtmlNode>(DomPool.TargetNodes.Intersect(runres));
                    HashSet <HtmlNode> sposprecision = new HashSet <HtmlNode>(DomPool.TargetNodesPrecision.Intersect(runres));

                    // Per-document figures: restrict every set to the nodes of the current document.
                    foreach (var entry in DomPool.docsAndNames)
                    {
                        HashSet <HtmlNode> docNodes          = new HashSet <HtmlNode>(entry.Value.SelectNodes("//*"));
                        HashSet <HtmlNode> currspos          = new HashSet <HtmlNode>(spos.Intersect(docNodes));
                        HashSet <HtmlNode> currrunres        = new HashSet <HtmlNode>(runres.Intersect(docNodes));
                        HashSet <HtmlNode> currsposprecision = new HashSet <HtmlNode>(sposprecision.Intersect(docNodes));
                        HashSet <HtmlNode> currTargetNodes   = new HashSet <HtmlNode>(DomPool.TargetNodes.Intersect(docNodes));
                        Console.WriteLine(entry.Key + "-Accuracy:" + (currsposprecision.Count() / ((double)currrunres.Count())) + ". Recall:" + (currspos.Count() / ((double)currTargetNodes.Count())) + "");
                    }

                    Console.WriteLine("Accuracy:" + (sposprecision.Count() / ((double)runres.Count())) + ". Recall:" + (spos.Count() / ((double)DomPool.TargetNodes.Count())) + "");
                }
                else
                {
                    Console.WriteLine("null");
                }

                Console.WriteLine("insert query:");
                q = ReadLine();
            }
        }

        // Announces the redirect, runs runTest with Console.Out writing to fileName (created or
        // overwritten), and always restores the previous Console.Out and closes the file - even
        // when runTest throws.
        private static void RunWithConsoleRedirectedToFile(string fileName, Action runTest)
        {
            Console.WriteLine("Output is redirected to " + fileName + " in the debug dir");

            //write results to text file instead of windows
            TextWriter previousOut = Console.Out;
            using (FileStream fs = new FileStream(fileName, FileMode.Create))
            using (StreamWriter sw = new StreamWriter(fs))
            {
                Console.SetOut(sw);
                try
                {
                    runTest();
                }
                finally
                {
                    Console.SetOut(previousOut);
                    sw.Flush();
                }
            }
        }
        /// <summary>
        /// Builds a Weka training set with one instance per node in <c>allNodes</c> (one 0/1
        /// attribute per data-set attribute, class "yes"/"no" by membership in
        /// <c>DomPool.TargetNodes</c>), trains a J48 tree on it, stores the tree in
        /// <c>classifierTree</c> and the features extracted from the tree's graph in <c>FeaturesUsed</c>.
        /// </summary>
        public void LearnModel()
        {
            Init();

            // For every selector feature, note its string on each known node its query returns.
            foreach (Feature currFeature in DomPool.SelectorFeatures)
            {
                String             featureString = currFeature.ToString();
                HashSet <HtmlNode> resNodes      = DomPool.RunXpathQuery(featureString);
                foreach (HtmlNode nd in resNodes)
                {
                    if (allNodes.Contains(nd))
                    {
                        nodeFeatures[nd].Add(featureString);
                    }
                }
            }

            FastVector fvWekaAttributes = GetDataSetAtts();
            Instances  trainingSet      = new Instances("TS", fvWekaAttributes, 100);

            // The last attribute is the class attribute.
            int classIndex = fvWekaAttributes.size() - 1;
            trainingSet.setClassIndex(classIndex);

            // Target instances are replicated to balance the classes. The ratio is an integer
            // division, so it is clamped to at least one copy: without the clamp every target
            // instance would be dropped whenever targets outnumber non-targets.
            int targetCount  = DomPool.TargetNodes.Count();
            int targetCopies = targetCount == 0 ? 0 : Math.Max(1, DomPool.NonTargetNodes.Count() / targetCount);

            foreach (HtmlNode currNode in allNodes)
            {
                Instance item = new SparseInstance(fvWekaAttributes.size());

                for (int i = 0; i < classIndex; i++)
                {
                    weka.core.Attribute att = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
                    if (nodeFeatures[currNode].Contains(att.name()))
                    {
                        item.setValue(att, 1);
                    }
                    else
                    {
                        item.setValue(att, 0);
                    }
                }

                //set the class
                bool                isTarget     = DomPool.TargetNodes.Contains(currNode);
                weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(classIndex);
                item.setValue(classFeature, (isTarget ? "yes" : "no"));
                item.setDataset(trainingSet);

                if (isTarget)
                {
                    for (int t = 0; t < targetCopies; t++)
                    {
                        trainingSet.add(new SparseInstance(item));
                    }
                }
                else
                {
                    trainingSet.add(item);
                }
            }

            String[] options = new String[2];
            options[0] = "-C";                 // J48 pruning confidence factor (not "unpruned" - that is -U)
            options[1] = "0.1";
            J48 tree = new J48();              // new instance of tree

            tree.setOptions(options);          // set the options
            tree.buildClassifier(trainingSet); // build classifier
            //save the resulting classifier
            classifierTree = tree;

            // Parse the tree's graph output to find out which features the tree actually uses.
            Reader    treeDot   = new StringReader(tree.graph());
            TreeBuild treeBuild = new TreeBuild();
            Node      treeRoot  = treeBuild.create(treeDot);

            FeaturesUsed = getTreeFeatures(treeRoot);
        }
Пример #6
0
        /// <summary>
        /// Builds a Weka training set with one instance per node in <c>allNodes</c> (one 0/1
        /// attribute per data-set attribute, class "yes"/"no" by membership in
        /// <c>DomPool.TargetNodes</c>), trains a NaiveBayes classifier on it with its default
        /// options, stores it in <c>classifier</c> and records the string of every selector
        /// feature in <c>FeaturesUsed</c>.
        /// </summary>
        public void LearnModel()
        {
            Init();

            // For every selector feature, note its string on each known node its query returns.
            foreach (Feature currFeature in DomPool.SelectorFeatures)
            {
                String             featureString = currFeature.ToString();
                HashSet <HtmlNode> resNodes      = DomPool.RunXpathQuery(featureString);
                foreach (HtmlNode nd in resNodes)
                {
                    if (allNodes.Contains(nd))
                    {
                        nodeFeatures[nd].Add(featureString);
                    }
                }
            }

            FastVector fvWekaAttributes = GetDataSetAtts();
            Instances  trainingSet      = new Instances("TS", fvWekaAttributes, 10);

            // The last attribute is the class attribute.
            int classIndex = fvWekaAttributes.size() - 1;
            trainingSet.setClassIndex(classIndex);

            // Target instances are replicated to balance the classes. The ratio is an integer
            // division, so it is clamped to at least one copy: without the clamp every target
            // instance would be dropped whenever targets outnumber non-targets.
            int targetCount  = DomPool.TargetNodes.Count();
            int targetCopies = targetCount == 0 ? 0 : Math.Max(1, DomPool.NonTargetNodes.Count() / targetCount);

            foreach (HtmlNode currNode in allNodes)
            {
                Instance item = new SparseInstance(fvWekaAttributes.size());

                for (int i = 0; i < classIndex; i++)
                {
                    weka.core.Attribute att = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
                    if (nodeFeatures[currNode].Contains(att.name()))
                    {
                        item.setValue(att, 1);
                    }
                    else
                    {
                        item.setValue(att, 0);
                    }
                }

                //set the class
                bool                isTarget     = DomPool.TargetNodes.Contains(currNode);
                weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(classIndex);
                item.setValue(classFeature, (isTarget ? "yes" : "no"));
                item.setDataset(trainingSet);

                if (isTarget)
                {
                    for (int t = 0; t < targetCopies; t++)
                    {
                        trainingSet.add(new SparseInstance(item));
                    }
                }
                else
                {
                    trainingSet.add(item);
                }
            }

            NaiveBayes cls = new NaiveBayes(); // new classifier instance, default options

            cls.buildClassifier(trainingSet);  // build classifier
            //save the resulting classifier
            classifier = cls;

            // Unlike a tree, this model gives no subset of used features to extract,
            // so every selector feature is reported as used.
            FeaturesUsed = new HashSet <string>();

            foreach (Feature f in DomPool.SelectorFeatures)
            {
                FeaturesUsed.Add(f.ToString());
            }
        }