Exemple #1
0
        /// <summary>
        /// Obtains the HTML text for <paramref name="url"/> through
        /// <c>URLDownloader.GetHtmlOfURL</c>, parses it into a fresh document and
        /// hands back that document's root node.
        /// </summary>
        /// <param name="url">Address passed unchanged to <c>URLDownloader.GetHtmlOfURL</c>.</param>
        /// <returns>The <c>DocumentNode</c> of the newly parsed document.</returns>
        public static HtmlNode GetHtmlNode(String url)
        {
            String fetchedMarkup = URLDownloader.GetHtmlOfURL(url);

            var parsedPage = new HtmlDocument();
            parsedPage.LoadHtml(fetchedMarkup);

            HtmlNode rootNode = parsedPage.DocumentNode;
            return rootNode;
        }
Exemple #2
0
        /// <summary>
        /// Processes every site in <paramref name="siteinfos"/>: collects page links per
        /// attribute by following pagination from the site's start URLs, learns extraction
        /// XPaths from the first collected links, and passes the remaining links to
        /// <c>CheckOnTest</c>. After all sites are handled the method waits on
        /// <c>Console.ReadLine()</c>.
        /// </summary>
        /// <param name="siteinfos">Sites to process, handled in list order.</param>
        public static void PorcessSites(List <SiteInfo> siteinfos)
        {
            foreach (SiteInfo si in siteinfos)
            {
                try
                {
                    // Link budget shared by all start URLs of this site.
                    int leftPages = pagesNum * si.attributeExtraction.Keys.Count();

                    EnsureSiteRegistered(si);

                    // Collect links from every start URL.
                    foreach (String startURL in si.StartURLs)
                    {
                        try
                        {
                            CollectLinksFromStartUrl(si, startURL, ref leftPages);
                        }
                        catch (Exception e)
                        {
                            // Best effort: a start URL that cannot be crawled is skipped so the
                            // remaining ones are still processed. Expected loop exits no longer
                            // arrive here, so what is reported is a genuine failure.
                            Console.Error.WriteLine("Skipping start URL " + startURL + ": " + e.Message);
                        }
                    }

                    foreach (String attr in si.attributeExtraction.Keys)
                    {
                        LearnAndEvaluateAttribute(si, attr);
                    }
                }
                finally
                {
                    // Drop the per-site document cache even when processing fails part-way.
                    SiteDocuments.Remove(si.SiteName);
                }
            }
            Console.ReadLine();
        }

        // Creates the per-attribute containers in SiteDocuments and SiteLinks for a site
        // that has no SiteDocuments entry yet.
        private static void EnsureSiteRegistered(SiteInfo si)
        {
            if (SiteDocuments.ContainsKey(si.SiteName))
            {
                return;
            }

            SiteDocuments[si.SiteName] = new Dictionary <string, Dictionary <String, HtmlNode> >();
            SiteLinks[si.SiteName]     = new Dictionary <string, HashSet <String> >();
            foreach (String attrName in si.attributeExtraction.Keys)
            {
                SiteDocuments[si.SiteName][attrName] = new Dictionary <string, HtmlNode>();
                SiteLinks[si.SiteName][attrName]     = new HashSet <string>();
            }
        }

        // Follows the pagination chain that starts at startURL and records newly seen links
        // in SiteLinks for each attribute, decrementing leftPages once per recorded link.
        // Stops when the budget is used up, when no next page is found, or when the next
        // page link points back at the current page.
        private static void CollectLinksFromStartUrl(SiteInfo si, String startURL, ref int leftPages)
        {
            String currURL = startURL;
            while (currURL != null)
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(URLDownloader.GetHtmlOfURL(currURL));

                if (!String.IsNullOrEmpty(si.PageExtractionXpath))
                {
                    var links = doc.DocumentNode.SelectNodes(si.PageExtractionXpath);
                    if (links == null)
                    {
                        // Nothing was selected on this page. The crawl of this start URL ends
                        // here, as it did before (previously via a swallowed null dereference).
                        return;
                    }

                    foreach (HtmlNode lnk in links)
                    {
                        try
                        {
                            String pageLink = lnk.Attributes["href"].Value;
                            // The resolved URL does not depend on the attribute, so compute it once.
                            var newURL = URLDownloader.UrlFixIfRelative(pageLink, currURL);

                            foreach (string attr in si.attributeExtraction.Keys)
                            {
                                // Add returns false when the link was already recorded.
                                if (!SiteLinks[si.SiteName][attr].Add(newURL))
                                {
                                    continue;
                                }
                                if (--leftPages <= 0)
                                {
                                    break;
                                }
                            }
                            if (leftPages <= 0)
                            {
                                break;
                            }
                        }
                        catch (Exception)
                        {
                            // Best effort: a link that cannot be read or resolved is skipped.
                        }
                    }
                }
                else
                {
                    // No link-extraction XPath configured: the visited page itself is recorded.
                    foreach (string attr in si.attributeExtraction.Keys)
                    {
                        try
                        {
                            if (!SiteLinks[si.SiteName][attr].Add(currURL))
                            {
                                continue;
                            }
                            if (--leftPages <= 0)
                            {
                                break;
                            }
                        }
                        catch
                        {
                            break;
                        }
                    }
                }

                if (leftPages <= 0)
                {
                    return;
                }

                // Get next page.
                String nextLink = null;
                try
                {
                    nextLink = HttpUtility.HtmlDecode(doc.DocumentNode.SelectSingleNode(si.NextPageXPath).Attributes["href"].Value);
                }
                catch
                {
                    // No usable next-page link on this page.
                    nextLink = null;
                }

                if (nextLink == null)
                {
                    return;
                }

                nextLink = URLDownloader.UrlFixIfRelative(nextLink, currURL);
                if (nextLink == null ||
                    String.Equals(currURL.Trim(), nextLink.Trim(), StringComparison.OrdinalIgnoreCase))
                {
                    // The next link is missing or points at the current page: stop to avoid looping.
                    return;
                }
                currURL = nextLink;
            }
        }

        // Uses the first links collected for attr as training pages: marks the nodes selected
        // by the attribute's XPath, stores each page under "huge/<site>/training/<attr>/",
        // learns XPaths from them and calls CheckOnTest on the remaining links.
        private static void LearnAndEvaluateAttribute(SiteInfo si, String attr)
        {
            const int trainingPageCount = 5;

            var trainingkeys = new HashSet <String>(SiteLinks[si.SiteName][attr].Take(trainingPageCount));
            var trainingDic  = new Dictionary <String, HtmlNode>();
            foreach (String lnk in trainingkeys)
            {
                trainingDic.Add(lnk, GetHtmlNode(lnk));
            }

            String trainingDir = "huge/" + si.SiteName + "/training/" + attr;
            foreach (var page in trainingDic)
            {
                HtmlNode adoc = page.Value;
                try
                {
                    var gt = adoc.SelectNodes(si.attributeExtraction[attr]);
                    if (gt != null)
                    {
                        foreach (var targetNode in gt)
                        {
                            if (targetNode.Attributes.Contains("userselected"))
                            {
                                targetNode.SetAttributeValue("userselected", "yes");
                            }
                            else
                            {
                                targetNode.Attributes.Add("userselected", "yes");
                            }
                        }
                    }
                }
                catch
                {
                    // Best effort: a page whose target nodes cannot be selected is kept unmarked.
                }

                // Store the (possibly marked) page once; an existing file is left untouched.
                String pagePath = trainingDir + "/" + getMD5(page.Key) + ".html";
                if (!File.Exists(pagePath))
                {
                    Directory.CreateDirectory(trainingDir);
                    File.WriteAllText(pagePath, adoc.InnerHtml);
                }
            }

            DomPool.LoadDocuments(trainingDic);
            //DomPool.LoadTestDocuments();
            DomPool.Initiate(new HashSet <string>(trainingDic.Keys));
            DomPool.ExtractAllFeatures();

            // Run code
            DecisionNode dn = new DecisionNode();
            dn.InitialNodeSet   = new HashSet <HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes));
            dn.SelectedNegative = new HashSet <HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision));
            dn.SelectedPositive = new HashSet <HtmlNode>(DomPool.TargetNodes);
            dn.FeatureSet       = new HashSet <Feature>();
            dn.CalculateEntropy();

            DecisionTreeLearning.RecursiveTreeImprovement(dn);
            var xpath = XpathTools.GenerateAForgivingXpath(dn);

            // NOTE(review): the non-forgiving XPath is computed but not used afterwards; the call
            // is kept in case DecisionTreeToXpath has effects beyond its return value — confirm.
            var xpathNonForgiving = XpathTools.DecisionTreeToXpath(dn, new HashSet <Feature>(), 1);
            xpathNonForgiving = "//*" + (xpathNonForgiving.Equals("") ? "" : ("[" + xpathNonForgiving + "]"));

            XpathAlignment model = new XpathAlignment();
            model.LearnModel();
            var alignmentXpath = model.xpath;

            // Each evaluation gets its own set of the links that were not used for training.
            CheckOnTest(new HashSet <string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), xpath, si.attributeExtraction[attr], si.SiteName, attr, "ForgivingXP");
            CheckOnTest(new HashSet <string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), alignmentXpath, si.attributeExtraction[attr], si.SiteName, attr, "Alignment");
        }