Exemplo n.º 1
0
        /// <summary>
        /// Initializes a new instance of the <see cref="WebBrowserDocument"/> class
        /// </summary>
        /// <param name="url">Value stored as this document's URL</param>
        /// <param name="msDoc">The <see cref="IHTMLDocument2"/> instance this object keeps a reference to</param>
        public WebBrowserDocument(string url, IHTMLDocument2 msDoc)
        {
            // Plain state first: the supplied arguments, an empty highlight list,
            // and the flag recording that no DOM modification has been applied yet.
            _url                     = url;
            _msDoc                   = msDoc;
            _highlights              = new List<HtmlHighlight>();
            _domModificationsApplied = false;

            // NOTE(review): ReplaceFreeTextNodes is listed twice, before and after
            // ReplacePreservedWhitespace - presumably intentional; confirm before deduplicating.
            _domModifications = new IDomModification[]
            {
                new RemoveOrphanElements(),
                new RemoveUnknownElements(),
                new RemoveCommentElements(),
                new RemoveEmptyFreeTextNodes(),
                new ReplaceFreeTextNodes(),
                new ReplacePreservedWhitespace(),
                new ReplaceFreeTextNodes(),
            };

            // The classifier and the zone tree builder receive the same list of element types.
            _elementClassifier = new PreZoningClassification(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);

            _zoneTreeBuilder = ZoneTreeBuilder.Create(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);

            _documentFactory = new MsHtmlDocumentFactory(DefaultStyleLookup.CreateForInternetExplorer());
        }
Exemplo n.º 2
0
        /// <summary>
        /// Creates the <see cref="AppServices"/> instance and sets up its serializers,
        /// element classifier, tree builders and article content labelers
        /// </summary>
        public AppServices()
        {
            // Validation of the serialization result is switched off for the HTML document serializer.
            const bool validateSerializationResult = false;

            // One style lookup instance is shared by both serializers.
            var styleLookup = DefaultStyleLookup.CreateForInternetExplorer();

            _htmlDocumentSerializer = new HtmlDocumentSerializer(styleLookup, validateSerializationResult);
            _zoneTreeSerializer     = new ZoneTreeSerializer(styleLookup);

            // The classifier and the zone tree builder receive the same list of element types.
            _elementClassifier = new PreZoningClassification(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);

            _zoneTreeBuilder = ZoneTreeBuilder.Create(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);

            _columnTreeBuilder = ColumnTreeBuilder.Create();

            // Three labelers, all assigning ZoneLabel.ArticleContent: one given a natural
            // language processor, and one each for the ARTICLE and MAIN tags.
            _layoutAnalysisArticleContentLabeler = new ArticleContentLabeler(
                new OpenNaturalLanguageProcessor(),
                ZoneLabel.Paragraph,
                ZoneLabel.ArticleContent,
                ZoneFeature.Common_Tokens);

            _articleTagArticleContentLabeler = new SemanticTagArticleContentLabeler(Html.Tags.ARTICLE, ZoneLabel.ArticleContent);
            _mainTagArticleContentLabeler    = new SemanticTagArticleContentLabeler(Html.Tags.MAIN, ZoneLabel.ArticleContent);
        }