Пример #1
0
        /// <summary>
        /// Loads <paramref name="urlString"/> in a <c>NonDispBrowser</c>, stores the
        /// page's first <c>html</c> element in <c>_Node</c>, and adds every element
        /// returned by <c>MiBAT</c> for each entry of <c>_Node.All</c> to
        /// <c>ContentNodeSet</c>.
        /// </summary>
        /// <param name="urlString">Address handed to <c>NavigateAndWait</c>.</param>
        public ContentExtractionUsingMiBAT(string urlString)
        {
            NonDispBrowser browser = new NonDispBrowser();
            browser.NavigateAndWait(urlString);

            // The root is the first <html> element rather than the document body.
            _Node = browser.Document.GetElementsByTagName("html")[0];

            foreach (HtmlElement element in _Node.All)
            {
                // Merge this element's MiBAT result into the overall set.
                foreach (HtmlElement extracted in MiBAT(element))
                {
                    ContentNodeSet.Add(extracted);
                }
            }
        }
        /// <summary>
        /// Loads <paramref name="urlString"/> in a <c>NonDispBrowser</c>, stores the
        /// page's first <c>html</c> element in <c>_Node</c>, then runs
        /// <c>CreateNodeProperty</c>, <c>CreateLossRatio</c>, <c>FindContentNode</c>
        /// and <c>FindSeparator</c> in that order.
        /// </summary>
        /// <param name="urlString">Address handed to <c>NavigateAndWait</c>.</param>
        /// <param name="thresholdTextLength">
        /// When greater than zero, assigned to <c>ThresholdTextLength</c>; otherwise
        /// the existing value of <c>ThresholdTextLength</c> is left as it is.
        /// </param>
        public ContentExtractionUsingLossRatio(string urlString, int thresholdTextLength = 0)
        {
            // Only a positive value overrides the current threshold.
            bool overrideThreshold = thresholdTextLength > 0;
            if (overrideThreshold)
            {
                ThresholdTextLength = thresholdTextLength;
            }

            NonDispBrowser browser = new NonDispBrowser();
            browser.NavigateAndWait(urlString);

            // The root is the first <html> element rather than the document body.
            HtmlElement root = browser.Document.GetElementsByTagName("html")[0];
            _Node = root;

            // Analysis steps, kept in their original order.
            CreateNodeProperty(_Node);
            CreateLossRatio(_Node);
            FindContentNode();
            FindSeparator();
        }