Esempio n. 1
0
        /// <summary>
        /// Runs the extraction experiment for every attribute folder located two directory levels
        /// below <paramref name="filesLocation"/> and prints per-site and averaged precision,
        /// recall and F-score, plus the accumulated query time, to the console.
        /// </summary>
        /// <remarks>
        /// For each attribute folder the documents are loaded into <c>DomPool</c>; then, for every
        /// training-set size <c>i</c> from 1 up to one less than the number of loaded documents,
        /// every index subset returned by <c>Subsets</c> is used as a training set and the selected
        /// tool is evaluated on the documents that are not part of that subset.
        /// The method waits on <c>Console.ReadLine()</c> before returning.
        /// </remarks>
        /// <param name="filesLocation">
        /// Root directory; each sub-directory is reported as a category and each of its
        /// sub-directories as an attribute. The matching test documents are looked up under the
        /// path obtained by replacing this value with <c>"testset"</c> in the attribute path.
        /// </param>
        public static void RunTest(string filesLocation)
        {
            // Display names indexed by tool id; only ids in [firstTool, tools.Length) are executed.
            // NOTE(review): id 3 runs the NB learner but is labelled "svm" here - confirm the intended label.
            string[] tools = new string[] { "our", "our - not forgiving", "j48", "svm", "xpath-align", "svm" };
            const int firstTool = 5;

            foreach (string categoryDir in Directory.GetDirectories(filesLocation))
            {
                Console.WriteLine("Running for category:" + categoryDir);
                foreach (string attributeDir in Directory.GetDirectories(categoryDir))
                {
                    Console.Out.Flush();
                    Console.WriteLine("Running for att:" + attributeDir);

                    DomPool.LoadTestDocuments(attributeDir.Replace(filesLocation, "testset"));
                    DomPool.LoadDocuments(attributeDir);

                    for (int i = 1; i <= (DomPool.allDocsNames.Count() - 1); i++)
                    {
                        // Filled by tool 0 and read back by tool 1 for the same training subset,
                        // so it has to outlive the tool loop.
                        Dictionary<string, string> xpathNonForgiving = new Dictionary<string, string>();

                        for (int tool = firstTool; tool < tools.Length; tool++)
                        {
                            Console.WriteLine("[-] running for training set size=" + i);
                            IEnumerable<IEnumerable<int>> subsetsIndexes = Subsets(DomPool.allDocsNames.Count(), i);
                            Console.WriteLine("[-] tool:" + tools[tool]);

                            // Count() walks the whole subset sequence, so it is evaluated only here.
                            Console.WriteLine("+ will run " + subsetsIndexes.Count() + " different iterations for the current set size");

                            Dictionary<string, double> siteTotalRecall    = new Dictionary<string, double>();
                            Dictionary<string, double> siteTotalPrecision = new Dictionary<string, double>();
                            Dictionary<string, double> siteTotalTests     = new Dictionary<string, double>();
                            foreach (string site in DomPool.allDocsNames)
                            {
                                siteTotalPrecision[site] = 0;
                                siteTotalRecall[site]    = 0;
                                siteTotalTests[site]     = 0;
                            }

                            long totalTime = 0;
                            int  iteration = 0;
                            foreach (IEnumerable<int> currSubsetIndexes in subsetsIndexes)
                            {
                                // Textual key of the subset (",idx,idx,..."), used to pair tool 0 and tool 1 runs.
                                string subsetKey = new List<int>(currSubsetIndexes).Aggregate("", (acc, index) => acc + "," + index);

                                // Progress marker every tenth subset.
                                iteration++;
                                if (iteration % 10 == 0)
                                {
                                    Console.Write(".");
                                }

                                HashSet<string> currSubset = GetSubSet(DomPool.allDocsNames, currSubsetIndexes);
                                DomPool.Initiate(currSubset);
                                DomPool.ExtractAllFeatures();

                                long elapsedMs;
                                HashSet<HtmlNode> runres = ExecuteToolOnTestSet(tool, subsetKey, xpathNonForgiving, out elapsedMs);
                                totalTime += elapsedMs;

                                AccumulatePerSiteScores(runres, siteTotalPrecision, siteTotalRecall, siteTotalTests);
                            }

                            PrintPerSiteResults(tools[tool], i, siteTotalPrecision, siteTotalRecall, siteTotalTests, totalTime);
                        }
                    }
                }
            }

            Console.ReadLine();
        }

        /// <summary>
        /// Trains the tool identified by <paramref name="tool"/> on the current <c>DomPool</c> state
        /// and runs it on the test set, timing only the test-set run (not the training).
        /// </summary>
        /// <param name="tool">Tool id: 0/1 are the xpath variants, 2 ModelLearner, 3 NB, 4 XpathAlignment, anything else SVM.</param>
        /// <param name="subsetKey">Key of the current training subset inside <paramref name="xpathNonForgiving"/>.</param>
        /// <param name="xpathNonForgiving">Per-subset non-forgiving xpath queries; written by tool 0, read by tool 1.</param>
        /// <param name="elapsedMs">Milliseconds spent running the tool on the test set.</param>
        /// <returns>The nodes the tool selected on the test set.</returns>
        private static HashSet<HtmlNode> ExecuteToolOnTestSet(int tool, string subsetKey, Dictionary<string, string> xpathNonForgiving, out long elapsedMs)
        {
            // Each branch does its (untimed) preparation and hands back the call to be timed.
            Func<HashSet<HtmlNode>> evaluate;
            switch (tool)
            {
                case 0:
                {
                    DecisionNode dn = new DecisionNode
                    {
                        InitialNodeSet   = new HashSet<HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes)),
                        SelectedNegative = new HashSet<HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision)),
                        SelectedPositive = new HashSet<HtmlNode>(DomPool.TargetNodes),
                        FeatureSet       = new HashSet<Feature>()
                    };
                    dn.CalculateEntropy();

                    DecisionTreeLearning.RecursiveTreeImprovement(dn);

                    string forgivingXpath = XpathTools.GenerateAForgivingXpath(dn);

                    // Remember the non-forgiving query of the same tree so tool 1 can reuse it.
                    string treeCondition = XpathTools.DecisionTreeToXpath(dn, new HashSet<Feature>(), 1);
                    xpathNonForgiving[subsetKey] = "//*" + (treeCondition.Equals("") ? "" : ("[" + treeCondition + "]"));

                    Console.WriteLine("Query:" + forgivingXpath);
                    evaluate = () => DomPool.TESTRunXpathQuery(forgivingXpath);
                    break;
                }

                case 1:
                {
                    string strictXpath = xpathNonForgiving[subsetKey];
                    Console.WriteLine("Query:" + strictXpath);
                    evaluate = () => DomPool.TESTRunXpathQuery(strictXpath);
                    break;
                }

                case 2:
                {
                    ModelLearner model = new ModelLearner();
                    model.LearnModel();
                    evaluate = () => model.RunOnTestSet();
                    break;
                }

                case 3:
                {
                    NB model = new NB();
                    model.LearnModel();
                    evaluate = () => model.RunOnTestSet();
                    break;
                }

                case 4:
                {
                    XpathAlignment model = new XpathAlignment();
                    model.LearnModel();
                    evaluate = () => model.RunOnTestSet();
                    break;
                }

                default:
                {
                    SVM model = new SVM();
                    model.LearnModel();
                    evaluate = () => model.RunOnTestSet();
                    break;
                }
            }

            Stopwatch watch = Stopwatch.StartNew();
            HashSet<HtmlNode> result = evaluate();
            watch.Stop();
            elapsedMs = watch.ElapsedMilliseconds;
            return result;
        }

        /// <summary>
        /// Adds the precision and recall of <paramref name="runres"/> on every non-training document
        /// to the per-site running totals and increments that site's test counter.
        /// </summary>
        /// <param name="runres">Nodes selected by the tool on the test set.</param>
        /// <param name="siteTotalPrecision">Per-site sum of precision values (updated in place).</param>
        /// <param name="siteTotalRecall">Per-site sum of recall values (updated in place).</param>
        /// <param name="siteTotalTests">Per-site number of evaluations (updated in place).</param>
        private static void AccumulatePerSiteScores(
            HashSet<HtmlNode> runres,
            Dictionary<string, double> siteTotalPrecision,
            Dictionary<string, double> siteTotalRecall,
            Dictionary<string, double> siteTotalTests)
        {
            HashSet<HtmlNode> spos          = new HashSet<HtmlNode>(DomPool.TESTTargetNodes.Intersect(runres));
            HashSet<HtmlNode> sposprecision = new HashSet<HtmlNode>(DomPool.TESTTargetNodesPrecision.Intersect(runres));

            foreach (var entry in DomPool.docsAndNames)
            {
                // Documents used for training are not scored.
                if (DomPool.trainingDocsNames.Contains(entry.Key))
                {
                    continue;
                }

                HashSet<HtmlNode> docNodes = new HashSet<HtmlNode>(entry.Value.SelectNodes("//*"));

                int selectedInDoc = runres.Intersect(docNodes).Count();

                // A run that selected nothing in this document contributes zero to both sums
                // but still counts as a test.
                if (selectedInDoc > 0)
                {
                    int hitsForRecall    = spos.Intersect(docNodes).Count();
                    int hitsForPrecision = sposprecision.Intersect(docNodes).Count();
                    int targetsInDoc     = DomPool.TESTTargetNodes.Intersect(docNodes).Count();

                    // NOTE(review): when targetsInDoc is 0 the recall below is 0/0 = NaN and sticks
                    // to the site's total - confirm whether such documents can occur and how they
                    // should be scored.
                    siteTotalPrecision[entry.Key] = siteTotalPrecision[entry.Key] + (hitsForPrecision / ((double)selectedInDoc));
                    siteTotalRecall[entry.Key]    = siteTotalRecall[entry.Key] + (hitsForRecall / ((double)targetsInDoc));
                }

                siteTotalTests[entry.Key] = siteTotalTests[entry.Key] + 1;
            }
        }

        /// <summary>
        /// Prints the per-site precision, recall and F-score followed by their averages and the
        /// accumulated run time.
        /// </summary>
        /// <param name="toolName">Display name of the evaluated tool.</param>
        /// <param name="trainingSetSize">Training-set size the results belong to.</param>
        /// <param name="siteTotalPrecision">Per-site sum of precision values.</param>
        /// <param name="siteTotalRecall">Per-site sum of recall values.</param>
        /// <param name="siteTotalTests">Per-site number of evaluations.</param>
        /// <param name="totalTime">Accumulated test-set run time in milliseconds.</param>
        private static void PrintPerSiteResults(
            string toolName,
            int trainingSetSize,
            Dictionary<string, double> siteTotalPrecision,
            Dictionary<string, double> siteTotalRecall,
            Dictionary<string, double> siteTotalTests,
            long totalTime)
        {
            Console.WriteLine("########## Results " + toolName + " for i=" + trainingSetSize + "##########");
            Console.WriteLine("+++++++++ Detailed Results for i=" + trainingSetSize + "++++++++++#");

            double siteCount       = 0;
            double testedSiteCount = 0;
            double recallSum       = 0;
            double precisionSum    = 0;
            double fscoreSum       = 0;

            foreach (string site in DomPool.allDocsNames)
            {
                // A site that was never evaluated is divided by 1 (yielding 0 scores) and is left
                // out of the precision average's denominator.
                double tests = siteTotalTests[site];
                if (tests < 1)
                {
                    tests++;
                }
                else
                {
                    testedSiteCount++;
                }

                double sitePrecision = siteTotalPrecision[site] / tests;
                double siteRecall    = siteTotalRecall[site] / tests;

                // The harmonic mean is 0/0 when both inputs are zero; report 0 in that case.
                double siteFscore = (siteRecall == 0 && sitePrecision == 0)
                    ? 0
                    : 2 * (sitePrecision * siteRecall) / (sitePrecision + siteRecall);

                siteCount++;
                recallSum    = recallSum + siteRecall;
                precisionSum = precisionSum + sitePrecision;
                fscoreSum    = fscoreSum + siteFscore;

                Console.WriteLine(">" + site + ": Precision:" + sitePrecision + " , Recall:" + siteRecall + ", F-score:" + siteFscore);
            }

            Console.WriteLine("++++++++++++++++Total+++++++++++++++++");

            // Recall and F-score are averaged over all sites, precision only over evaluated sites.
            Console.WriteLine("Recall:" + (recallSum / siteCount));
            Console.WriteLine("Precision:" + (precisionSum / testedSiteCount));
            Console.WriteLine("F-score:" + (fscoreSum / siteCount));
            Console.WriteLine("Time:" + totalTime);
        }