/// <summary>
/// Fetches the HTML for <paramref name="url"/> through <c>URLDownloader.GetHtmlOfURL</c>,
/// loads it into a new <c>HtmlDocument</c> and returns that document's root node.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>The <c>DocumentNode</c> of the freshly parsed document.</returns>
public static HtmlNode GetHtmlNode(String url)
{
    String pageMarkup = URLDownloader.GetHtmlOfURL(url);

    var parsedPage = new HtmlDocument();
    parsedPage.LoadHtml(pageMarkup);

    return parsedPage.DocumentNode;
}
/// <summary>
/// Processes every site in <paramref name="siteinfos"/>:
/// 1. makes sure the per-site, per-attribute entries exist in <c>SiteDocuments</c> / <c>SiteLinks</c>;
/// 2. walks each start URL (following the "next page" link) and records page links per attribute
///    until the page budget (<c>pagesNum</c> times the number of attributes) is used up;
/// 3. for each attribute, trains on the first few collected links and evaluates the learned
///    XPaths on the remaining links via <c>CheckOnTest</c>.
/// The site's entry in <c>SiteDocuments</c> is removed afterwards, even when processing throws.
/// After all sites are processed the method waits on <c>Console.ReadLine()</c>.
/// </summary>
/// <param name="siteinfos">Sites to process.</param>
public static void PorcessSites(List<SiteInfo> siteinfos)
{
    foreach (SiteInfo si in siteinfos)
    {
        try
        {
            // The budget is shared by all start URLs of the site.
            int leftPages = pagesNum * si.attributeExtraction.Keys.Count();

            EnsureSiteStorage(si);

            foreach (String startURL in si.StartURLs)
            {
                leftPages = CollectLinksFromStartUrl(si, startURL, leftPages);
            }

            foreach (String attr in si.attributeExtraction.Keys)
            {
                TrainAndEvaluateAttribute(si, attr);
            }
        }
        finally
        {
            // Cleanup must also happen when an exception escapes the processing above.
            SiteDocuments.Remove(si.SiteName);
        }
    }

    Console.ReadLine();
}

// Creates the per-attribute containers in SiteDocuments and SiteLinks when the site has no SiteDocuments entry yet.
private static void EnsureSiteStorage(SiteInfo si)
{
    if (SiteDocuments.ContainsKey(si.SiteName))
    {
        return;
    }

    SiteDocuments[si.SiteName] = new Dictionary<string, Dictionary<String, HtmlNode>>();
    SiteLinks[si.SiteName] = new Dictionary<string, HashSet<String>>();

    foreach (String attrName in si.attributeExtraction.Keys)
    {
        SiteDocuments[si.SiteName][attrName] = new Dictionary<string, HtmlNode>();
        SiteLinks[si.SiteName][attrName] = new HashSet<string>();
    }
}

// Follows pagination from one start URL, adding page links to SiteLinks; returns the remaining page budget.
private static int CollectLinksFromStartUrl(SiteInfo si, String startURL, int leftPages)
{
    try
    {
        // Pages already fetched from this start URL. Stops on a self-referencing "next" link
        // (as the original comparison did) and also on longer pagination cycles.
        var visitedPages = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        String currURL = startURL;
        while (currURL != null && visitedPages.Add(currURL.Trim()))
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(URLDownloader.GetHtmlOfURL(currURL));

            if (!String.IsNullOrEmpty(si.PageExtractionXpath))
            {
                // SelectNodes yields null (not an empty list) when nothing matches.
                var links = doc.DocumentNode.SelectNodes(si.PageExtractionXpath);
                if (links != null)
                {
                    foreach (HtmlNode lnk in links)
                    {
                        try
                        {
                            var hrefAttribute = lnk.Attributes["href"];
                            if (hrefAttribute == null)
                            {
                                continue;
                            }

                            // Same value for every attribute, so resolve it once.
                            String newURL = URLDownloader.UrlFixIfRelative(hrefAttribute.Value, currURL);

                            foreach (string attr in si.attributeExtraction.Keys)
                            {
                                // Add returns false when the link is already recorded.
                                if (!SiteLinks[si.SiteName][attr].Add(newURL))
                                {
                                    continue;
                                }

                                if (--leftPages <= 0)
                                {
                                    break;
                                }
                            }

                            if (leftPages <= 0)
                            {
                                break;
                            }
                        }
                        catch (Exception)
                        {
                            // Best effort: a link that cannot be processed is skipped.
                        }
                    }
                }
            }
            else
            {
                // No extraction XPath: the listing page itself is the page to record.
                foreach (string attr in si.attributeExtraction.Keys)
                {
                    try
                    {
                        if (!SiteLinks[si.SiteName][attr].Add(currURL))
                        {
                            continue;
                        }

                        if (--leftPages <= 0)
                        {
                            break;
                        }
                    }
                    catch (Exception)
                    {
                        break;
                    }
                }
            }

            if (leftPages <= 0)
            {
                break;
            }

            // Get next page.
            String nextLink = null;
            try
            {
                var nextNode = String.IsNullOrEmpty(si.NextPageXPath)
                    ? null
                    : doc.DocumentNode.SelectSingleNode(si.NextPageXPath);
                var nextHref = nextNode == null ? null : nextNode.Attributes["href"];
                if (nextHref != null)
                {
                    nextLink = HttpUtility.HtmlDecode(nextHref.Value);
                }
            }
            catch (Exception)
            {
                // Evaluating the next-page XPath failed: treat it as "no next page".
                nextLink = null;
            }

            if (nextLink == null)
            {
                break;
            }

            currURL = URLDownloader.UrlFixIfRelative(nextLink, currURL);
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failing start URL must not stop the remaining ones.
        Console.Error.WriteLine("PorcessSites: crawling from " + startURL + " failed: " + ex.Message);
    }

    return leftPages;
}

// Trains on the first collected links of one attribute and evaluates the learned XPaths on the remaining links.
private static void TrainAndEvaluateAttribute(SiteInfo si, String attr)
{
    const int trainingPageCount = 5;

    var trainingkeys = new HashSet<String>(SiteLinks[si.SiteName][attr].Take(trainingPageCount));

    var trainingDic = new Dictionary<String, HtmlNode>();
    foreach (String lnk in trainingkeys)
    {
        trainingDic.Add(lnk, GetHtmlNode(lnk));
    }

    foreach (var trainingPage in trainingDic)
    {
        String lnk = trainingPage.Key;
        HtmlNode adoc = trainingPage.Value;

        try
        {
            // Flag the nodes selected by the attribute's XPath with userselected="yes".
            var gt = adoc.SelectNodes(si.attributeExtraction[attr]);
            if (gt != null)
            {
                foreach (var targetNode in gt)
                {
                    // SetAttributeValue creates the attribute when it is missing.
                    targetNode.SetAttributeValue("userselected", "yes");
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: the page is still saved and used, just without flagged nodes.
            Console.Error.WriteLine("PorcessSites: marking target nodes in " + lnk + " failed: " + ex.Message);
        }

        String trainingDir = "huge/" + si.SiteName + "/training/" + attr;
        String trainingFile = trainingDir + "/" + getMD5(lnk) + ".html";
        if (!File.Exists(trainingFile))
        {
            Directory.CreateDirectory(trainingDir);
            File.WriteAllText(trainingFile, adoc.InnerHtml);
        }
    }

    DomPool.LoadDocuments(trainingDic);
    DomPool.Initiate(new HashSet<string>(trainingDic.Keys));
    DomPool.ExtractAllFeatures();

    // Run code
    DecisionNode dn = new DecisionNode();
    dn.InitialNodeSet = new HashSet<HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes));
    dn.SelectedNegative = new HashSet<HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision));
    dn.SelectedPositive = new HashSet<HtmlNode>(DomPool.TargetNodes);
    dn.FeatureSet = new HashSet<Feature>();
    dn.CalculateEntropy();
    DecisionTreeLearning.RecursiveTreeImprovement(dn);

    var xpath = XpathTools.GenerateAForgivingXpath(dn);

    // NOTE(review): this value is not used afterwards; the call is kept in case
    // DecisionTreeToXpath has side effects - confirm before removing.
    var xpathNonForgiving = XpathTools.DecisionTreeToXpath(dn, new HashSet<Feature>(), 1);
    xpathNonForgiving = "//*" + (xpathNonForgiving.Equals("") ? "" : ("[" + xpathNonForgiving + "]"));

    XpathAlignment model = new XpathAlignment();
    model.LearnModel();
    var alignmentXpath = model.xpath;

    // Each call gets its own set, in case CheckOnTest modifies the one it receives.
    CheckOnTest(new HashSet<string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), xpath, si.attributeExtraction[attr], si.SiteName, attr, "ForgivingXP");
    CheckOnTest(new HashSet<string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), alignmentXpath, si.attributeExtraction[attr], si.SiteName, attr, "Alignment");
}