예제 #1
0
        /// <summary>
        /// Builds the set of selector features derived from a node's text:
        /// one exact-match predicate on the trimmed text plus one
        /// <c>contains(text(), ...)</c> predicate per sufficiently long fragment.
        /// </summary>
        /// <param name="text">Raw text to generalize. It is dereferenced directly, so it must not be null.</param>
        /// <returns>A set holding the exact-match feature and every fragment feature.</returns>
        public static HashSet <Feature> GeneralizeTextFeatures(String text)
        {
            var features = new HashSet<Feature>();

            // Exact match on the trimmed, escaped text.
            var exactMatch = new Feature
            {
                type    = Feature.FeatureType.Selector,
                feature = new List<string> { "text()='" + XpathTools.EscapeString(text.Trim()) + "'" },
                cost    = FeatureCosts.TEXT_EQUAL
            };
            features.Add(exactMatch);

            // Zero-width split in front of every upper-case ASCII letter, hyphen or
            // whitespace character. The text is escaped *before* splitting, so the
            // fragments are pieces of the escaped (untrimmed) string.
            string[] fragments = Regex.Split(XpathTools.EscapeString(text), "(?=([A-Z\\-\\s]))");

            // Strip hyphens and surrounding whitespace, then ignore anything shorter
            // than two characters (this also discards the one-character entries that
            // the capturing group contributes to the split result).
            IEnumerable<string> usable = fragments
                                         .Select(piece => piece.Replace("-", "").Trim())
                                         .Where(piece => piece.Length >= 2);

            foreach (string fragment in usable)
            {
                var containsFeature = new Feature
                {
                    type    = Feature.FeatureType.Selector,
                    feature = new List<string> { "contains(text(),'" + fragment + "')" },
                    cost    = FeatureCosts.TEXT_CONTAINS
                };
                features.Add(containsFeature);
            }

            return features;
        }
        /// <summary>
        /// Reduces <paramref name="featureSet"/> to the <paramref name="k"/> features
        /// with the lowest entropy, where each feature is scored on its own by running
        /// its XPath query and measuring how many of the matched nodes are target nodes.
        /// </summary>
        /// <param name="featureSet">Candidate features.</param>
        /// <param name="k">Number of features to keep.</param>
        /// <returns>
        /// The input set itself when it holds at most <c>k * 200</c> features,
        /// otherwise a new set with at most <paramref name="k"/> features.
        /// </returns>
        public static HashSet <Feature> KeepTopK(HashSet <Feature> featureSet, int k)
        {
            // The generous k * 200 bound effectively switches the filtering off:
            // scoring costs far more in performance than the pruning saves.
            if (featureSet.Count() <= k * 200)
            {
                return featureSet;
            }

            var scored = new List<KeyValuePair<Feature, double>>();
            foreach (Feature candidate in featureSet)
            {
                var singleton = new HashSet<Feature>(new Feature[] { candidate });
                HashSet<HtmlNode> matched = DomPool.RunXpathQuery(XpathTools.FeatureSetToXpath(singleton));

                // Share of the matched nodes that are real targets.
                // NOTE(review): when the query matches nothing this is 0/0 = NaN; how
                // CalculateEntropy and the ordering treat that is not visible here.
                int    targetHits    = matched.Intersect(DomPool.TargetNodes).Count();
                double positiveShare = (double)targetHits / matched.Count;
                double entropy       = Statistics.CalculateEntropy(positiveShare, 1 - positiveShare);

                scored.Add(new KeyValuePair<Feature, double>(candidate, entropy));
            }

            // Later features come first among equal entropies (OrderBy is stable),
            // so flip the collection order before sorting.
            scored.Reverse();

            IEnumerable<Feature> best = scored
                                        .OrderBy(entry => entry.Value)
                                        .Select(entry => entry.Key)
                                        .Take(k);

            return new HashSet<Feature>(best);
        }
        /// <summary>
        /// Walks this decision node and its children and collects the nodes the tree selects.
        /// </summary>
        /// <param name="nodes">Nodes that reached this decision node.</param>
        /// <param name="prevFeatures">Features already applied on the path to this node.</param>
        /// <param name="right">True when this node was reached through the "selected" branch.</param>
        /// <param name="threshold">Precision at or above which this node accepts <paramref name="nodes"/> unchanged.</param>
        /// <returns>The selected nodes; may be the <paramref name="nodes"/> instance itself.</returns>
        public HashSet <HtmlNode> selectTrue(HashSet <HtmlNode> nodes, HashSet <Feature> prevFeatures, Boolean right, double threshold = 1)
        {
            // Precise enough already: everything that reached this node is accepted.
            if (this.precision >= threshold)
            {
                return nodes;
            }

            // Features this node adds on top of what the path already applied.
            var unseen = new HashSet<Feature>(this.FeatureSet.Except(prevFeatures));
            if (unseen.Count == 0)
            {
                // No new feature here: keep the nodes on the "selected" side, drop them otherwise.
                return right ? nodes : new HashSet<HtmlNode>();
            }

            Feature splitFeature = unseen.First();
            string  splitXpath   = XpathTools.FeatureSetToXpath(new HashSet<Feature>() { splitFeature });

            // NOTE(review): IntersectWith mutates the set returned by RunXpathQuery in
            // place; confirm that the query result is not shared or cached elsewhere.
            HashSet<HtmlNode> matched = DomPool.RunXpathQuery(splitXpath);
            matched.IntersectWith(nodes);

            HashSet<HtmlNode> fromSelected    = this.SetSelected.selectTrue(matched, this.FeatureSet, true, threshold);
            HashSet<HtmlNode> fromNotSelected = this.SetNotSelected.selectTrue(nodes, prevFeatures, false, threshold);

            return new HashSet<HtmlNode>(fromSelected.Union(fromNotSelected));
        }
예제 #4
0
 /// <summary>
 /// Returns the textual form of this feature. The string is computed on the
 /// first call and cached in <c>_stringRepresentation</c> for later calls.
 /// </summary>
 /// <returns>
 /// The XPath produced for this feature alone when it has exactly one part,
 /// otherwise its parts joined with <c>&gt;</c>.
 /// </returns>
 public override string ToString()
 {
     if (_stringRepresentation != null)
     {
         return _stringRepresentation;
     }

     _stringRepresentation = feature.Count() == 1
         ? XpathTools.FeatureSetToXpath(new List<Feature>() { this })
         : String.Join(">", feature);

     return _stringRepresentation;
 }
        /// <summary>
        /// Loads the training documents, builds a decision tree over the extracted
        /// features and returns the XPath generated from that tree.
        /// </summary>
        /// <param name="filesLocation">Location passed to <c>DomPool.LoadDocuments</c>.</param>
        /// <returns>The result of <c>XpathTools.GenerateAForgivingXpath</c> for the learned tree.</returns>
        public static string LearnXpathFromTrainingFiles(string filesLocation)
        {
            // Prepare the shared DomPool state from the training documents.
            DomPool.LoadDocuments(filesLocation);
            DomPool.Initiate();
            DomPool.ExtractAllFeatures();

            // Root of the tree: all labelled nodes, no features applied yet.
            var root = new DecisionNode
            {
                InitialNodeSet   = new HashSet<HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes)),
                SelectedNegative = new HashSet<HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision)),
                SelectedPositive = new HashSet<HtmlNode>(DomPool.TargetNodes),
                FeatureSet       = new HashSet<Feature>()
            };
            root.CalculateEntropy();

            DecisionTreeLearning.RecursiveTreeImprovement(root);

            // A strict alternative would be:
            // "//*[" + XpathTools.DecisionTreeToXpath(root, new HashSet<Feature>()) + "]"
            return XpathTools.GenerateAForgivingXpath(root);
        }
예제 #6
0
        /// <summary>
        /// Evaluation harness. For every category folder under <paramref name="filesLocation"/>
        /// and every attribute folder inside it, trains on each subset of the documents
        /// (for every training-set size from 1 to document count - 1), runs the selected
        /// tool on the test documents and prints per-site and averaged precision, recall
        /// and F-score plus the accumulated run time to the console.
        /// Blocks on <c>Console.ReadLine()</c> when finished.
        /// </summary>
        /// <param name="filesLocation">Root directory containing one sub-directory per category.</param>
        public static void RunTest(string filesLocation)
        {
            string[] folders = Directory.GetDirectories(filesLocation);
            foreach (string fldr in folders)
            {
                Console.WriteLine("Running for category:" + fldr);
                string[] innerfolders = Directory.GetDirectories(fldr);
                foreach (string innerdir in innerfolders)
                {
                    Console.Out.Flush();
                    Console.WriteLine("Running for att:" + innerdir);

                    // The test documents live under the same relative path, with the
                    // filesLocation part of the path replaced by "testset".
                    DomPool.LoadTestDocuments(innerdir.Replace(filesLocation, "testset"));
                    DomPool.LoadDocuments(innerdir);
                    //for(int i= (DomPool.allDocsNames.Count() - 1); i <= (DomPool.allDocsNames.Count()-1)/*DomPool.allDocsNames.Count()*/; i++)
                    // i is the training-set size.
                    for (int i = 1; i <= (DomPool.allDocsNames.Count() - 1); i++)
                    {
                        // Tool index -> label. With toolStart = 5 and the loop bound of 6 only
                        // index 5 runs, which falls through to the SVM branch below.
                        // NOTE(review): index 3 is labelled "svm" but its branch instantiates NB - confirm the label.
                        string[] tools     = new string[] { "our", "our - not forgiving", "j48", "svm", "xpath-align", "svm" };
                        int      toolStart = 5;
                        // Strict XPath per subset, written by tool 0 and read back by tool 1.
                        Dictionary <string, string> xpathNonForgiving = new Dictionary <string, string>();
                        for (int tool = toolStart; tool < 6; tool++)
                        {
                            Console.WriteLine("[-] running for training set size=" + i);
                            // NOTE(review): subsetsIndexes is enumerated several times below (each
                            // Count() call plus the foreach); if Subsets is lazy, every pass regenerates it.
                            IEnumerable <IEnumerable <int> > subsetsIndexes = Subsets(DomPool.allDocsNames.Count(), i);
                            //Reduce size ...for testing only
                            //subsetsIndexes = subsetsIndexes.Take(30);
                            double totalAccuracy = 0;
                            double totalRecall   = 0;
                            long   totalTime     = 0;
                            Console.WriteLine("[-] tool:" + tools[tool]);
                            Console.WriteLine("+ will run " + subsetsIndexes.Count() + " different iterations for the current set size");
                            int s = 0;
                            // Per-site accumulators, keyed by document name.
                            Dictionary <String, double> SiteTotalRecall    = new Dictionary <string, double>();
                            Dictionary <String, double> SiteTotalPrecision = new Dictionary <string, double>();
                            Dictionary <String, double> SiteTotalTests     = new Dictionary <string, double>();
                            foreach (string site in DomPool.allDocsNames)
                            {
                                SiteTotalPrecision[site] = 0;
                                SiteTotalRecall[site]    = 0;
                                SiteTotalTests[site]     = 0;
                            }


                            foreach (IEnumerable <int> currSubsetIndexes in subsetsIndexes)
                            {
                                List <int> listRep   = new List <int>(currSubsetIndexes);
                                // Comma-joined indexes; used as the key into xpathNonForgiving.
                                string     stringRep = listRep.Aggregate("", (b, x) => b + "," + x);
                                s++;
                                // Progress indicator: one dot every ten subsets.
                                if (s % 10 == 0)
                                {
                                    //Console.Write("(" + s + "/" + subsetsIndexes.Count() + ") ");

                                    Console.Write(".");
                                }
                                //if (tool == toolStart)
                                //{
                                // Re-initialise the pool with the current training subset.
                                HashSet <String> currSubset = GetSubSet(DomPool.allDocsNames, currSubsetIndexes);
                                DomPool.Initiate(currSubset);
                                DomPool.ExtractAllFeatures();
                                //}
                                var runres = new HashSet <HtmlNode>();
                                //our method
                                if (tool < 2)
                                {
                                    string xpath = "";
                                    if (tool == 0)
                                    {
                                        DecisionNode dn = new DecisionNode();
                                        dn.InitialNodeSet   = new HashSet <HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes));
                                        dn.SelectedNegative = new HashSet <HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision));
                                        dn.SelectedPositive = new HashSet <HtmlNode>(DomPool.TargetNodes);
                                        dn.FeatureSet       = new HashSet <Feature>();
                                        dn.CalculateEntropy();

                                        DecisionTreeLearning.RecursiveTreeImprovement(dn);


                                        xpath = XpathTools.GenerateAForgivingXpath(dn);

                                        // Also store the strict XPath for this subset; an empty
                                        // predicate degrades to a plain "//*".
                                        xpathNonForgiving[stringRep] = XpathTools.DecisionTreeToXpath(dn, new HashSet <Feature>(), 1);
                                        xpathNonForgiving[stringRep] = "//*" + (xpathNonForgiving[stringRep].Equals("") ? "" : ("[" + xpathNonForgiving[stringRep] + "]"));
                                    }

                                    if (tool == 1)
                                    {
                                        // Relies on tool 0 having run for the same subset in this pass.
                                        xpath = xpathNonForgiving[stringRep];
                                    }

                                    Console.WriteLine("Query:" + xpath);

                                    // Only the query execution is timed, not the learning.
                                    var watch = Stopwatch.StartNew();
                                    runres = DomPool.TESTRunXpathQuery(xpath);
                                    watch.Stop();
                                    var elapsedMs = watch.ElapsedMilliseconds;
                                    totalTime = totalTime + elapsedMs;
                                }
                                else
                                {
                                    // Baseline tools: learn a model, then time only RunOnTestSet.
                                    if (tool == 2)
                                    {
                                        ModelLearner model = new ModelLearner();
                                        model.LearnModel();
                                        var watch = Stopwatch.StartNew();
                                        runres = model.RunOnTestSet();

                                        watch.Stop();
                                        var elapsedMs = watch.ElapsedMilliseconds;
                                        totalTime = totalTime + elapsedMs;
                                    }
                                    else
                                    {
                                        if (tool == 3)
                                        {
                                            NB model = new NB();
                                            model.LearnModel();
                                            var watch = Stopwatch.StartNew();
                                            runres = model.RunOnTestSet();
                                            watch.Stop();
                                            var elapsedMs = watch.ElapsedMilliseconds;
                                            totalTime = totalTime + elapsedMs;
                                        }
                                        else
                                        {
                                            if (tool == 4)
                                            {
                                                XpathAlignment model = new XpathAlignment();
                                                model.LearnModel();
                                                var watch = Stopwatch.StartNew();
                                                runres = model.RunOnTestSet();
                                                watch.Stop();
                                                var elapsedMs = watch.ElapsedMilliseconds;
                                                totalTime = totalTime + elapsedMs;
                                            }
                                            else
                                            {
                                                SVM model = new SVM();
                                                model.LearnModel();
                                                var watch = Stopwatch.StartNew();
                                                runres = model.RunOnTestSet();
                                                watch.Stop();
                                                var elapsedMs = watch.ElapsedMilliseconds;
                                                totalTime = totalTime + elapsedMs;
                                            }
                                        }
                                    }
                                }


                                // spos feeds the recall figures, sposprecision the precision figures.
                                HashSet <HtmlNode> spos          = new HashSet <HtmlNode>(DomPool.TESTTargetNodes.Intersect(runres));
                                HashSet <HtmlNode> sposprecision = new HashSet <HtmlNode>(DomPool.TESTTargetNodesPrecision.Intersect(runres));

                                // Per-document breakdown; documents used for training are skipped.
                                foreach (var entry in DomPool.docsAndNames)
                                {
                                    if (DomPool.trainingDocsNames.Contains(entry.Key))
                                    {
                                        continue;
                                    }

                                    HashSet <HtmlNode> docNodes          = new HashSet <HtmlNode>(entry.Value.SelectNodes("//*"));
                                    HashSet <HtmlNode> currspos          = new HashSet <HtmlNode>(spos.Intersect(docNodes));
                                    HashSet <HtmlNode> currrunres        = new HashSet <HtmlNode>(runres.Intersect(docNodes));
                                    HashSet <HtmlNode> currsposprecision = new HashSet <HtmlNode>(sposprecision.Intersect(docNodes));
                                    HashSet <HtmlNode> currTargetNodes   = new HashSet <HtmlNode>(DomPool.TESTTargetNodes.Intersect(docNodes));
                                    double             currSiteAccuracy  = (currsposprecision.Count() / ((double)currrunres.Count()));
                                    // NOTE(review): if a document has no target nodes this is 0/0 = NaN,
                                    // and it is still added below whenever the tool selected something.
                                    double             currSiteRecall    = (currspos.Count() / ((double)currTargetNodes.Count()));
                                    // Scores are accumulated only when the tool selected at least one node
                                    // in this document; the test counter below is incremented regardless.
                                    if (((double)currrunres.Count()) > 0)
                                    {
                                        SiteTotalPrecision[entry.Key] = SiteTotalPrecision[entry.Key] + currSiteAccuracy;
                                        SiteTotalRecall[entry.Key]    = SiteTotalRecall[entry.Key] + currSiteRecall;
                                    }

                                    SiteTotalTests[entry.Key] = SiteTotalTests[entry.Key] + 1;
                                }

                                double currAccuracy = (sposprecision.Count() / ((double)runres.Count()));
                                double currRecall   = (spos.Count() / ((double)DomPool.TESTTargetNodes.Count()));
                                if (runres.Count() > 0)
                                {
                                    totalAccuracy = totalAccuracy + currAccuracy;
                                    totalRecall   = totalRecall + currRecall;
                                }
                            }

                            // NOTE(review): these two averages are computed but not printed or
                            // otherwise read in the rest of this method.
                            totalAccuracy = totalAccuracy / subsetsIndexes.Count();
                            totalRecall   = totalRecall / subsetsIndexes.Count();
                            Console.WriteLine("########## Results " + tools[tool] + " for i=" + i + "##########");

                            Console.WriteLine("+++++++++ Detailed Results for i=" + i + "++++++++++#");
                            double count             = 0;
                            double totalSumPrecision = 0;
                            double totalSumRecall    = 0;
                            double avgRecall         = 0;
                            double avgPrecision      = 0;
                            double avgFscore         = 0;
                            double numPrecision      = 0;

                            foreach (string site in DomPool.allDocsNames)
                            {
                                // A site that was never tested gets a divisor of 1 (avoiding a
                                // division by zero) and is left out of the precision denominator.
                                if (SiteTotalTests[site] < 1)
                                {
                                    SiteTotalTests[site]++;
                                }
                                else
                                {
                                    numPrecision++;
                                }

                                double sitePrecision = SiteTotalPrecision[site] / SiteTotalTests[site];
                                double siteRecall    = SiteTotalRecall[site] / SiteTotalTests[site];
                                double siteFscore    = 2 * (sitePrecision * siteRecall) / (sitePrecision + siteRecall);
                                // 0/0 above yields NaN; report an F-score of 0 in that case.
                                if (siteRecall == 0 && sitePrecision == 0)
                                {
                                    siteFscore = 0;
                                }

                                count++;
                                avgRecall    = avgRecall + siteRecall;
                                avgPrecision = avgPrecision + sitePrecision;
                                avgFscore    = avgFscore + siteFscore;

                                Console.WriteLine(">" + site + ": Precision:" + sitePrecision + " , Recall:" + siteRecall + ", F-score:" + siteFscore);
                            }
                            Console.WriteLine("++++++++++++++++Total+++++++++++++++++");
                            // Recall and F-score are averaged over all sites, precision only over
                            // the sites that were tested at least once.
                            // NOTE(review): numPrecision can be 0, which makes avgPrecision NaN.
                            avgRecall    = avgRecall / count;
                            avgPrecision = avgPrecision / numPrecision;
                            avgFscore    = avgFscore / count;

                            Console.WriteLine("Recall:" + avgRecall);
                            Console.WriteLine("Precision:" + avgPrecision);
                            Console.WriteLine("F-score:" + avgFscore);
                            // Sum of the Stopwatch milliseconds measured above.
                            Console.WriteLine("Time:" + totalTime);
                        }
                    }
                }
            }

            // Keep the console open until the user presses Enter.
            Console.ReadLine();
        }
예제 #7
0
        /// <summary>
        /// Tries to split <paramref name="dn"/> on the selector feature with the best
        /// cost-adjusted information gain. When such a feature is found, the node gets
        /// a <c>SetSelected</c> child (nodes matched by the feature) and a
        /// <c>SetNotSelected</c> child (the remaining nodes), and the feature is added
        /// to <c>dn.FeatureSet</c>. When no candidate qualifies, <paramref name="dn"/>
        /// is left without children.
        /// </summary>
        /// <param name="dn">Decision node to split.</param>
        /// <param name="level">Depth of <paramref name="dn"/> in the tree; it dampens the class-balance fix and the cost penalty.</param>
        /// <exception cref="DivideByZeroException">
        /// <c>DomPool.TargetNodes</c> is empty, so the class ratio cannot be computed.
        /// </exception>
        public static void ImproveTree(DecisionNode dn, int level)
        {
            double             maxScore           = 0;
            Feature            maxGainFeature     = null;
            HashSet <HtmlNode> newFeatureSelected = null;
            Object             lockObj            = new object();

            // Ratio of negative to positive examples. It must be a floating-point
            // division: dividing the two int counts truncated the ratio before it was
            // widened to double. The empty-target failure keeps the exception type
            // that the integer division used to raise.
            int targetCount = DomPool.TargetNodes.Count();
            if (targetCount == 0)
            {
                throw new DivideByZeroException("DomPool.TargetNodes is empty; the class-balance ratio cannot be computed.");
            }
            double classRatio = ((double)DomPool.NonTargetNodes.Count()) / targetCount;
            double balanceFix = Math.Max(1, Math.Pow(0.3, Math.Sqrt(level + 1)) * classRatio);
            double dnEntropy  = dn.CalculateEntropy(1, balanceFix);

            Parallel.ForEach(DomPool.SelectorFeatures, (currCandidate) =>
            {
                string currFeatureXpath = XpathTools.FeatureSetToXpath(new HashSet <Feature>()
                {
                    currCandidate
                });
                HashSet <HtmlNode> currFeatureXpathSelected = DomPool.RunXpathQuery(currFeatureXpath);

                // Partition this node's population by the candidate feature.
                HashSet <HtmlNode> xpathCurrSelected    = new HashSet <HtmlNode>(dn.InitialNodeSet.Intersect(currFeatureXpathSelected));
                HashSet <HtmlNode> xpathCurrNotSelected = new HashSet <HtmlNode>(dn.InitialNodeSet.Except(xpathCurrSelected));

                //calculate information gain
                int selectedPositiveCount    = xpathCurrSelected.Intersect(dn.SelectedPositive).Count();
                int selectedNegativeCount    = xpathCurrSelected.Intersect(dn.SelectedNegative).Count();
                int notSelectedPositiveCount = xpathCurrNotSelected.Intersect(dn.SelectedPositive).Count();

                // An empty partition yields NaN here; NaN fails both comparisons in the
                // selection test below, so such a candidate is never chosen.
                double sp = ((double)selectedPositiveCount) / xpathCurrSelected.Count;
                double sn = ((double)selectedNegativeCount) / xpathCurrSelected.Count;

                double relativeRecall = ((double)selectedPositiveCount) / ((double)dn.SelectedPositive.Count());

                // Math.Pow(0, level + 1) is 0 for any level >= 0, which makes the first
                // division a no-op there; it is kept so other levels behave as before.
                sn = sn / (1 + Math.Pow(0, level + 1));
                sn = sn / balanceFix;
                double selectedProbability = ((double)xpathCurrSelected.Count) / dn.InitialNodeSet.Count();

                double selectedEntropy = Statistics.CalculateEntropy(sp, sn);

                double nsp = ((double)notSelectedPositiveCount) / xpathCurrNotSelected.Count;
                double nsn = 1 - nsp;
                // Apply the same class-balance fix to the negative share of the other branch.
                nsn = nsn / balanceFix;

                double notselectedProbability = 1 - selectedProbability;
                double notSelectedEntropy     = Statistics.CalculateEntropy(nsp, nsn);

                // Re-weight both branch probabilities with the balance fix and renormalise
                // them so that they sum to 1.
                double balanceFixProb  = balanceFix;
                double sumTemp         = (selectedProbability * sn + selectedProbability * sp * balanceFixProb + notselectedProbability * nsn + notselectedProbability * nsp * balanceFixProb);
                selectedProbability    = (selectedProbability * sn + selectedProbability * sp * balanceFixProb) / sumTemp;
                notselectedProbability = (notselectedProbability * nsn + notselectedProbability * nsp * balanceFixProb) / sumTemp;
                double gain            = dnEntropy - ((selectedProbability * selectedEntropy) + (notselectedProbability * notSelectedEntropy));

                //Choose the most cost effective feature
                // The penalty grows with missed recall and with how rarely the feature
                // occurs across the training documents, and shrinks with depth.
                double featureFrequency = ((double)DomPool.FeatureFrequencey[currCandidate.feature.First().ToLower()]) / DomPool.trainingDocsNames.Count;
                double penalty          = ((1 - relativeRecall) + (1 - featureFrequency)) * Math.Pow(0.3, level);
                gain = gain / (currCandidate.cost + penalty);

                // Candidates are scored in parallel; ties on the score go to whichever
                // candidate reaches this lock first.
                lock (lockObj)
                {
                    if (gain > maxScore && sp > nsp)
                    {
                        maxScore           = gain;
                        maxGainFeature     = currCandidate;
                        newFeatureSelected = xpathCurrSelected;
                    }
                }
            });

            // No candidate improved on a zero score: leave the node as it is.
            if (maxGainFeature == null)
            {
                return;
            }

            // Child for the nodes matched by the winning feature.
            dn.SetSelected = new DecisionNode();
            dn.SetSelected.InitialNodeSet = newFeatureSelected;
            dn.SetSelected.FeatureSet     = new HashSet <Feature>(dn.FeatureSet);
            dn.SetSelected.FeatureSet.Add(maxGainFeature);
            dn.SetSelected.SelectedNegative = new HashSet <HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
            dn.SetSelected.SelectedPositive = new HashSet <HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
            dn.SetSelected.CalculateEntropy();

            // Child for the remaining nodes.
            dn.SetNotSelected = new DecisionNode();
            dn.SetNotSelected.InitialNodeSet = new HashSet <HtmlNode>(dn.InitialNodeSet.Except(newFeatureSelected));

            //FIX FOR NOT BRANCH, INSTEAD OF HAVING THE NOT.
            // When enabled, the negatives of the selected branch are also handed to the
            // not-selected branch.
            if (FixEnabledForNotBranch)
            {
                dn.SetNotSelected.InitialNodeSet.UnionWith(dn.SetSelected.SelectedNegative);
            }

            dn.SetNotSelected.FeatureSet       = new HashSet <Feature>(dn.FeatureSet);
            dn.SetNotSelected.SelectedNegative = new HashSet <HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
            dn.SetNotSelected.SelectedPositive = new HashSet <HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
            dn.SetNotSelected.CalculateEntropy();

            // Record the feature on the parent only after both children have copied
            // the parent's previous feature set.
            dn.FeatureSet.Add(maxGainFeature);
        }
예제 #8
0
        public static void PorcessSites(List <SiteInfo> siteinfos)
        {
            foreach (SiteInfo si in siteinfos)
            {
                try {
                    int leftPages             = pagesNum * si.attributeExtraction.Keys.Count();
                    List <HtmlNode> sitePages = new List <HtmlNode>(0);
                    if (!SiteDocuments.ContainsKey(si.SiteName))
                    {
                        SiteDocuments[si.SiteName] = new Dictionary <string, Dictionary <String, HtmlNode> >();
                        SiteLinks[si.SiteName]     = new Dictionary <string, HashSet <String> >();
                        foreach (String attrName in si.attributeExtraction.Keys)
                        {
                            SiteDocuments[si.SiteName][attrName] = new Dictionary <string, HtmlNode>();
                            SiteLinks[si.SiteName][attrName]     = new HashSet <string>();
                        }
                    }

                    //Download all URLs
                    foreach (String startURL in si.StartURLs)
                    {
                        try
                        {
                            String currURL = startURL;
                            while (currURL != null)
                            {
                                HtmlDocument doc = new HtmlDocument();
                                var currHTML     = URLDownloader.GetHtmlOfURL(currURL);
                                doc.LoadHtml(currHTML);
                                if (!String.IsNullOrEmpty(si.PageExtractionXpath))
                                {
                                    var links = doc.DocumentNode.SelectNodes(si.PageExtractionXpath);
                                    foreach (HtmlNode lnk in links)
                                    {
                                        try
                                        {
                                            String pageLink = lnk.Attributes["href"].Value;

                                            // var htmlstr = URLDownloader.GetHtmlOfURL(URLDownloader.UrlFixIfRelative(pageLink, currURL));
                                            foreach (string attr in si.attributeExtraction.Keys)
                                            {
                                                var newURL = URLDownloader.UrlFixIfRelative(pageLink, currURL);
                                                if (SiteLinks[si.SiteName][attr].Contains(newURL))
                                                {
                                                    continue;
                                                }
                                                //HtmlDocument innerDoc = new HtmlDocument();
                                                //innerDoc.LoadHtml(htmlstr);
                                                //SiteDocuments[si.SiteName][attr].Add(pageLink, innerDoc.DocumentNode);
                                                SiteLinks[si.SiteName][attr].Add(newURL);
                                                if (--leftPages <= 0)
                                                {
                                                    break;
                                                }
                                            }
                                            if (leftPages <= 0)
                                            {
                                                break;
                                            }
                                        }
                                        catch (Exception e) {
                                            // Console.WriteLine(e.StackTrace);
                                        }
                                    }
                                }
                                else
                                {
                                    foreach (string attr in si.attributeExtraction.Keys)
                                    {
                                        try
                                        {
                                            if (SiteLinks[si.SiteName][attr].Contains(currURL))
                                            {
                                                continue;
                                            }
                                            // HtmlDocument innerDoc = new HtmlDocument();
                                            // innerDoc.LoadHtml(currHTML);
                                            // SiteDocuments[si.SiteName][attr].Add(currURL, innerDoc.DocumentNode);
                                            SiteLinks[si.SiteName][attr].Add(currURL);
                                            if (--leftPages <= 0)
                                            {
                                                break;
                                            }
                                        }
                                        catch
                                        {
                                            break;
                                        }
                                    }
                                }
                                if (leftPages <= 0)
                                {
                                    break;
                                }
                                //get next page
                                String nextLink = null;
                                try
                                {
                                    nextLink = HttpUtility.HtmlDecode(doc.DocumentNode.SelectSingleNode(si.NextPageXPath).Attributes["href"].Value);
                                }
                                catch { nextLink = null; }

                                if (nextLink != null)
                                {
                                    nextLink = URLDownloader.UrlFixIfRelative(nextLink, currURL);
                                }
                                if (!currURL.ToLower().Trim().Equals(nextLink.ToLower().Trim()))
                                {
                                    currURL = nextLink;
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                        catch { }
                    }

                    foreach (String attr in si.attributeExtraction.Keys)
                    {
                        var trainingkeys = new  HashSet <String>(SiteLinks[si.SiteName][attr].Take(5)); //new HashSet<String>(SiteDocuments[si.SiteName][attr].Keys.Take(5));
                        var trainingDic  = new Dictionary <String, HtmlNode>();                         //SiteDocuments[si.SiteName][attr].Where(x => trainingkeys.Contains(x.Key)).ToDictionary(kv => kv.Key, kv => kv.Value);
                        foreach (String lnk in trainingkeys)
                        {
                            trainingDic.Add(lnk, GetHtmlNode(lnk));
                        }

                        var testDic = SiteDocuments[si.SiteName][attr].Where(x => !trainingkeys.Contains(x.Key)).ToDictionary(kv => kv.Key, kv => kv.Value);



                        foreach (var lnk in trainingDic.Keys)
                        {
                            HtmlNode adoc = trainingDic[lnk];
                            try
                            {
                                var gt = adoc.SelectNodes(si.attributeExtraction[attr]);
                                if (gt != null)
                                {
                                    foreach (var targetNode in gt)
                                    {
                                        //Console.Write(":");
                                        if (targetNode.Attributes.Contains("userselected"))
                                        {
                                            targetNode.SetAttributeValue("userselected", "yes");
                                        }
                                        else
                                        {
                                            targetNode.Attributes.Add("userselected", "yes");
                                        }
                                    }
                                }
                            }
                            catch { }

                            MD5 md5 = MD5.Create();


                            if (!File.Exists("huge/" + si.SiteName + "/training/" + attr + "/" + getMD5(lnk) + ".html"))
                            {
                                Directory.CreateDirectory("huge/" + si.SiteName + "/training/" + attr);
                                File.WriteAllText("huge/" + si.SiteName + "/training/" + attr + "/" + getMD5(lnk) + ".html", adoc.InnerHtml);
                            }
                        }

                        DomPool.LoadDocuments(trainingDic);
                        //DomPool.LoadTestDocuments();
                        DomPool.Initiate(new HashSet <string>(trainingDic.Keys));
                        DomPool.ExtractAllFeatures();

                        // Run code
                        DecisionNode dn = new DecisionNode();
                        dn.InitialNodeSet   = new HashSet <HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes));
                        dn.SelectedNegative = new HashSet <HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision));
                        dn.SelectedPositive = new HashSet <HtmlNode>(DomPool.TargetNodes);
                        dn.FeatureSet       = new HashSet <Feature>();
                        dn.CalculateEntropy();

                        DecisionTreeLearning.RecursiveTreeImprovement(dn);
                        var xpath             = XpathTools.GenerateAForgivingXpath(dn);
                        var xpathNonForgiving = XpathTools.DecisionTreeToXpath(dn, new HashSet <Feature>(), 1);
                        xpathNonForgiving = "//*" + (xpathNonForgiving.Equals("") ? "" : ("[" + xpathNonForgiving + "]"));

                        XpathAlignment model = new XpathAlignment();
                        model.LearnModel();
                        var alignmentXpath = model.xpath;


                        CheckOnTest(new HashSet <string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), xpath, si.attributeExtraction[attr], si.SiteName, attr, "ForgivingXP");
                        CheckOnTest(new HashSet <string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), alignmentXpath, si.attributeExtraction[attr], si.SiteName, attr, "Alignment");
                    }
                }finally{
                }

                SiteDocuments.Remove(si.SiteName);
            }
            Console.ReadLine();
        }
예제 #9
0
        /// <summary>
        /// Boundary pattern used to tokenize attribute values: a zero-width look-ahead that
        /// matches in front of an upper-case ASCII letter, '-', whitespace, '/', '?', '=' or '_'.
        /// Built once instead of on every call because the pattern never changes.
        /// </summary>
        private static readonly Regex AttributeValueBoundary = new Regex("(?=([A-Z\\-\\s/\\?\\=_]))");

        /// <summary>
        /// Builds the candidate XPath selector features derived from a single HTML attribute.
        /// </summary>
        /// <remarks>
        /// Features are added in this order:
        /// 1. attribute existence (<c>@name</c>);
        /// 2. when the trimmed value is non-empty, exact equality on this attribute and on any attribute;
        /// 3. for every token of the value, a case-insensitive "contains" test on this attribute and on any attribute;
        /// 4. the same "contains" tests for every pair of adjacent split elements, at a cost 0.05 higher.
        /// </remarks>
        /// <param name="att">The attribute to generalize.</param>
        /// <returns>
        /// The generated features; an empty set when the attribute is one of the selection
        /// marker attributes named by <c>DomPool.selectionAttribute</c> /
        /// <c>DomPool.optionalSelectionAttribute</c>.
        /// </returns>
        public static HashSet <Feature> GeneralizeAttributeFeatures(HtmlAttribute att)
        {
            // The selection marker attributes must never become features themselves.
            if (att.Name.Equals(DomPool.selectionAttribute) || att.Name.Equals(DomPool.optionalSelectionAttribute))
            {
                return(new HashSet <Feature>());
            }

            HashSet <Feature> res = new HashSet <Feature>();

            // Base condition: the attribute exists at all.
            res.Add(NewSelectorFeature("@" + att.Name, FeatureCosts.ATT_BASE));

            if (att.Value.Trim().Length >= 1)
            {
                string escapedValue = XpathTools.EscapeString(att.Value.Trim());
                res.Add(NewSelectorFeature("@" + att.Name + "='" + escapedValue + "'", FeatureCosts.ATT_EQUAL));
                res.Add(NewSelectorFeature("@*='" + escapedValue + "'", FeatureCosts.ATT_ANY_EQUAL));
            }

            // NOTE(review): the boundary pattern contains a capturing group, and Regex.Split
            // places captured text into the returned array, so `splitted` presumably interleaves
            // one-character separators with the real tokens. The single-token loop filters those
            // out via the length check, but the adjacent-pair loop further down then joins
            // token+separator instead of token+token. Left as-is; confirm the intended pairing.
            string[] splitted = AttributeValueBoundary.Split(XpathTools.EscapeString(att.Value));

            // TODO (from the original author): tokens should keep the letters and drop the
            // symbols; currently only '-' is removed.
            foreach (var split in splitted)
            {
                AddContainsFeatures(res, att.Name, split, FeatureCosts.ATT_CONTAINS, FeatureCosts.ATT_ANY_CONTAINS);
            }

            // Adjacent elements joined together are slightly more expensive than single tokens.
            for (int i = 0; i < splitted.Length - 1; i++)
            {
                AddContainsFeatures(res, att.Name, splitted[i] + splitted[i + 1], FeatureCosts.ATT_CONTAINS + 0.05, FeatureCosts.ATT_ANY_CONTAINS + 0.05);
            }

            return(res);
        }

        /// <summary>
        /// Adds the "this attribute contains token" and "any attribute contains token" features
        /// for one raw token. Tokens shorter than two characters after cleanup are ignored.
        /// </summary>
        private static void AddContainsFeatures(HashSet <Feature> res, string attName, string rawToken, double containsCost, double anyContainsCost)
        {
            string curr = rawToken.Replace("-", "").Trim();
            if (curr.Length < 2)
            {
                return;
            }

            // The XPath predicate lower-cases the attribute value with translate(), so the
            // needle has to be lower-cased the same way; otherwise any token that contains an
            // upper-case letter (which is what the tokenizer splits on) could never match.
            curr = ToAsciiLowerCase(curr);

            res.Add(NewSelectorFeature("@" + attName + "[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" + curr + "')]", containsCost));
            res.Add(NewSelectorFeature("@*[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" + curr + "')]", anyContainsCost));
        }

        /// <summary>
        /// Creates a selector-type feature holding a single XPath expression with the given cost.
        /// </summary>
        private static Feature NewSelectorFeature(string expression, double cost)
        {
            Feature f = new Feature();
            f.type    = Feature.FeatureType.Selector;
            f.feature = new List <string>()
            {
                expression
            };
            f.cost = cost;
            return(f);
        }

        /// <summary>
        /// Lower-cases only the ASCII letters A-Z, mirroring exactly what the XPath
        /// translate() call in the generated predicates does to the attribute value.
        /// Non-ASCII characters are left untouched on purpose.
        /// </summary>
        private static string ToAsciiLowerCase(string text)
        {
            char[] chars = text.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                if (chars[i] >= 'A' && chars[i] <= 'Z')
                {
                    chars[i] = (char)(chars[i] + ('a' - 'A'));
                }
            }
            return(new string(chars));
        }