/// <summary>
/// Loads the page at <paramref name="urlString"/> in a <c>NonDispBrowser</c>,
/// stores the document's first <c>html</c> element in <c>_Node</c>, and then
/// runs <c>MiBAT</c> on every element in <c>_Node.All</c>, adding each node it
/// returns to <c>ContentNodeSet</c>.
/// </summary>
/// <param name="urlString">Address of the page to load and analyse.</param>
public ContentExtractionUsingMiBAT(string urlString)
{
    NonDispBrowser browser = new NonDispBrowser();
    browser.NavigateAndWait(urlString);

    // The root is the first <html> element itself, not the document's Body.
    _Node = browser.Document.GetElementsByTagName("html")[0];

    foreach (HtmlElement element in _Node.All)
    {
        // Merge whatever MiBAT reports for this element into the overall result.
        foreach (HtmlElement contentNode in MiBAT(element))
        {
            ContentNodeSet.Add(contentNode);
        }
    }
}
/// <summary>
/// Loads the page at <paramref name="urlString"/> in a <c>NonDispBrowser</c>,
/// stores the document's first <c>html</c> element in <c>_Node</c>, and then
/// runs the extraction steps in order: <c>CreateNodeProperty</c>,
/// <c>CreateLossRatio</c>, <c>FindContentNode</c>, <c>FindSeparator</c>.
/// </summary>
/// <param name="urlString">Address of the page to load and analyse.</param>
/// <param name="thresholdTextLength">
/// When greater than zero, replaces the current <c>ThresholdTextLength</c>;
/// zero or a negative value leaves the existing setting untouched.
/// </param>
public ContentExtractionUsingLossRatio(string urlString, int thresholdTextLength = 0)
{
    // Only a positive value overrides the existing threshold.
    if (thresholdTextLength > 0)
    {
        ThresholdTextLength = thresholdTextLength;
    }

    NonDispBrowser browser = new NonDispBrowser();
    browser.NavigateAndWait(urlString);

    // The root is the first <html> element itself, not the document's Body.
    _Node = browser.Document.GetElementsByTagName("html")[0];

    // Both Create* steps take the root node and run before the two Find* steps.
    CreateNodeProperty(_Node);
    CreateLossRatio(_Node);

    FindContentNode();
    FindSeparator();
}