Esempio n. 1
0
        /// <summary>
        /// Initializes a new instance of the <see cref="WebBrowserDocument"/> class
        /// </summary>
        /// <param name="url">The url kept for this document</param>
        /// <param name="msDoc">The underlying <c>IHTMLDocument2</c> instance kept for this document</param>
        public WebBrowserDocument(string url, IHTMLDocument2 msDoc)
        {
            // State handed in by the caller
            _url   = url;
            _msDoc = msDoc;

            // Fresh document: no highlights recorded, DOM modifications not applied
            _highlights              = new List<HtmlHighlight>();
            _domModificationsApplied = false;

            // DOM modification steps, held in array order.
            // NOTE(review): ReplaceFreeTextNodes is listed again after ReplacePreservedWhitespace - presumably intentional; confirm
            _domModifications = new IDomModification[]
            {
                new RemoveOrphanElements(),
                new RemoveUnknownElements(),
                new RemoveCommentElements(),
                new RemoveEmptyFreeTextNodes(),
                new ReplaceFreeTextNodes(),
                new ReplacePreservedWhitespace(),
                new ReplaceFreeTextNodes(),
            };

            // Classifier and zone tree builder are created with the same list of element types
            _elementClassifier = new PreZoningClassification(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);
            _zoneTreeBuilder = ZoneTreeBuilder.Create(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);

            // The document factory uses the Internet Explorer default styles
            _documentFactory = new MsHtmlDocumentFactory(DefaultStyleLookup.CreateForInternetExplorer());
        }
Esempio n. 2
0
        /// <summary>
        /// Initializes a new instance of the <see cref="AppServices"/> class,
        /// creating the serializers, tree builders and article content labelers it holds
        /// </summary>
        public AppServices()
        {
            const bool validateSerializedOutput = false;

            // One Internet Explorer style lookup is shared by both serializers
            var ieDefaultStyles = DefaultStyleLookup.CreateForInternetExplorer();

            _htmlDocumentSerializer = new HtmlDocumentSerializer(ieDefaultStyles, validateSerializedOutput);
            _zoneTreeSerializer     = new ZoneTreeSerializer(ieDefaultStyles);

            // Classifier and zone tree builder are created with the same list of element types
            _elementClassifier = new PreZoningClassification(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);
            _zoneTreeBuilder = ZoneTreeBuilder.Create(
                HtmlElementType.SignificantBlock,
                HtmlElementType.SignificantInline,
                HtmlElementType.SignificantLinebreak,
                HtmlElementType.SignificantInvisible,
                HtmlElementType.BreakDown,
                HtmlElementType.Aname,
                HtmlElementType.Hidden);
            _columnTreeBuilder = ColumnTreeBuilder.Create();

            // Three article content labelers: one driven by a natural language processor,
            // two keyed on the ARTICLE and MAIN tags
            var languageProcessor = new OpenNaturalLanguageProcessor();

            _layoutAnalysisArticleContentLabeler = new ArticleContentLabeler(languageProcessor, ZoneLabel.Paragraph, ZoneLabel.ArticleContent, ZoneFeature.Common_Tokens);
            _articleTagArticleContentLabeler     = new SemanticTagArticleContentLabeler(Html.Tags.ARTICLE, ZoneLabel.ArticleContent);
            _mainTagArticleContentLabeler        = new SemanticTagArticleContentLabeler(Html.Tags.MAIN, ZoneLabel.ArticleContent);
        }
        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlDocumentSerializer" /> class
        /// </summary>
        /// <param name="defaultStyleLookup">The default style lookup kept by the serializer</param>
        /// <param name="validateResult">Flag kept by the serializer for validating the serialization result; <c>false</c> when omitted</param>
        public HtmlDocumentSerializer(DefaultStyleLookup defaultStyleLookup, bool validateResult = false)
        {
            _serializer = new XmlSerializer(typeof(SerializableElementList));

            // Writer: indented output, character checking off, new lines entitized
            var writerSettings = new XmlWriterSettings();
            writerSettings.Indent          = true;
            writerSettings.CheckCharacters = false;
            writerSettings.NewLineHandling = NewLineHandling.Entitize;
            _writerSettings = writerSettings;

            // Reader: character checking off, whitespace ignored
            var readerSettings = new XmlReaderSettings();
            readerSettings.CheckCharacters  = false;
            readerSettings.IgnoreWhitespace = true;
            _readerSettings = readerSettings;

            // Namespace set seeded with a single empty qualified name
            XmlQualifiedName[] emptyNames = { XmlQualifiedName.Empty };
            _namespaces = new XmlSerializerNamespaces(emptyNames);

            _defaultStyleLookup = defaultStyleLookup;
            _validateResult     = validateResult;
        }
 /// <summary>
 /// Initializes a new instance of the <see cref="HtmlDocumentSerializer" /> class,
 /// delegating to the other constructor with the lookup returned by
 /// <c>DefaultStyleLookup.CreateForInternetExplorer()</c>
 /// </summary>
 public HtmlDocumentSerializer()
     : this(DefaultStyleLookup.CreateForInternetExplorer())
 {
 }
Esempio n. 5
0
 /// <summary>
 /// Initializes a new instance of the <see cref="SerializableDocument" /> class
 /// </summary>
 /// <param name="root">The root element, passed on to the base constructor</param>
 /// <param name="info">The HTML document information, passed on to the base constructor</param>
 /// <param name="defaultStyleLookup">The default style lookup, passed on to the base constructor</param>
 /// <param name="html">The html, assigned to <c>Html</c></param>
 /// <param name="text">The text, assigned to <c>Text</c></param>
 public SerializableDocument(SerializableElement root, HtmlDocumentInfo info, DefaultStyleLookup defaultStyleLookup, string html, string text)
     : base(root, info, defaultStyleLookup)
 {
     this.Html = html;
     this.Text = text;
 }
Esempio n. 6
0
 /// <summary>
 /// Initializes a new instance of the <see cref="WebBrowser" /> class
 /// </summary>
 /// <param name="size">The size kept by the browser</param>
 /// <param name="defaultStyles">The default styles, assigned to <c>DefaultStyles</c></param>
 public WebBrowser(Size size, DefaultStyleLookup defaultStyles)
 {
     _size = size;
     this.DefaultStyles     = defaultStyles;

     // A new browser starts with rendering flagged as not complete
     this.RenderingComplete = false;
 }
Esempio n. 7
0
 /// <summary>
 /// Initializes a new instance of the <see cref="MsHtmlElementFactory"/> class
 /// </summary>
 /// <param name="sourceIndexOffset">The source index offset kept by the factory</param>
 /// <param name="defaultStyleLookup">The default style lookup kept by the factory</param>
 public MsHtmlElementFactory(int sourceIndexOffset, DefaultStyleLookup defaultStyleLookup)
 {
     _sourceIndexOffset  = sourceIndexOffset;
     _defaultStyleLookup = defaultStyleLookup;
 }
Esempio n. 8
0
 /// <summary>
 /// Initializes a new instance of the <see cref="SerializableElement" /> class
 /// by forwarding every argument, unchanged and in the same order, to <c>Initialize</c>
 /// </summary>
 /// <param name="id">The id</param>
 /// <param name="displayOrder">The display order</param>
 /// <param name="parentId">The parent id (nullable)</param>
 /// <param name="childrenIds">The children ids</param>
 /// <param name="boundingBox">The bounding box</param>
 /// <param name="classifications">The classifications</param>
 /// <param name="tagName">The tag name</param>
 /// <param name="attributes">The attributes</param>
 /// <param name="styles">The styles</param>
 /// <param name="defaultStyleLookup">The default style lookup</param>
 /// <param name="outerHtmlStartPos">The outer html start position</param>
 /// <param name="outerHtmlEndPos">The outer html end position</param>
 /// <param name="innerHtmlStartPos">The inner html start position</param>
 /// <param name="innerHtmlEndPos">The inner html end position</param>
 /// <param name="textStartPos">The text start position</param>
 /// <param name="textEndPos">The text end position</param>
 public SerializableElement(int id, int displayOrder, int?parentId, IReadOnlyList <int> childrenIds, Rectangle boundingBox, IEnumerable <string> classifications, string tagName,
                            IDictionary <string, string> attributes, IDictionary <string, string> styles, DefaultStyleLookup defaultStyleLookup,
                            int outerHtmlStartPos, int outerHtmlEndPos,
                            int innerHtmlStartPos, int innerHtmlEndPos,
                            int textStartPos, int textEndPos)
 {
     this.Initialize(id, displayOrder, parentId, childrenIds, boundingBox, classifications,
                     tagName, attributes, styles, defaultStyleLookup,
                     outerHtmlStartPos, outerHtmlEndPos,
                     innerHtmlStartPos, innerHtmlEndPos,
                     textStartPos, textEndPos);
 }
Esempio n. 9
0
 /// <summary>
 /// Initializes a new instance of the <see cref="MsHtmlDocument" /> class;
 /// all arguments are handed to the base constructor
 /// </summary>
 /// <param name="root">The root element</param>
 /// <param name="info">The HTML document information</param>
 /// <param name="defaultStyleLookup">The default style lookup</param>
 public MsHtmlDocument(MsHtmlElement root, HtmlDocumentInfo info, DefaultStyleLookup defaultStyleLookup)
     : base(root, info, defaultStyleLookup)
 {
 }
Esempio n. 10
0
 /// <summary>
 /// Initializes a new instance of the <see cref="MsHtmlDocumentFactory"/> class
 /// </summary>
 /// <param name="defaultStyleLookup">The default style lookup kept by the factory</param>
 public MsHtmlDocumentFactory(DefaultStyleLookup defaultStyleLookup)
 {
     _defaultStyleLookup = defaultStyleLookup;
 }
Esempio n. 11
0
 /// <summary>
 /// Initializes a new instance of the <see cref="InternetExplorer" /> class
 /// </summary>
 /// <param name="winFormsBrowser">The windows forms browser kept by this instance</param>
 /// <param name="size">The size, passed on to the base constructor</param>
 /// <param name="defaultStyles">The default styles, passed on to the base constructor and to the <c>MsHtmlDocumentFactory</c> created here</param>
 public InternetExplorer(System.Windows.Forms.WebBrowser winFormsBrowser, Size size, DefaultStyleLookup defaultStyles)
     : base(size, defaultStyles)
 {
     _winFormsBrowser = winFormsBrowser;
     _msDocFactory    = new MsHtmlDocumentFactory(defaultStyles);

     // ConfigureBrowser is invoked only after both fields above have been assigned
     this.ConfigureBrowser();
 }
Esempio n. 12
0
 /// <summary>
 /// Initializes a new instance of the <see cref="InternetExplorer" /> class,
 /// delegating to the three-argument constructor with the lookup returned by
 /// <c>DefaultStyleLookup.CreateForInternetExplorer()</c>
 /// </summary>
 /// <param name="winFormsBrowser">The windows forms browser</param>
 /// <param name="size">The size</param>
 public InternetExplorer(System.Windows.Forms.WebBrowser winFormsBrowser, Size size)
     : this(winFormsBrowser, size, DefaultStyleLookup.CreateForInternetExplorer())
 {
 }
Esempio n. 13
0
 /// <summary>
 /// Initializes a new instance of the <see cref="SerializableElement" /> class
 /// from an existing html element by forwarding every argument to <c>Initialize</c>
 /// </summary>
 /// <param name="element">The html element</param>
 /// <param name="outerHtmlStartPos">The outer html start position</param>
 /// <param name="outerHtmlEndPos">The outer html end position</param>
 /// <param name="innerHtmlStartPos">The inner html start position</param>
 /// <param name="innerHtmlEndPos">The inner html end position</param>
 /// <param name="textStartPos">The text start position</param>
 /// <param name="textEndPos">The text end position</param>
 /// <param name="defaultStyleLookup">The default style lookup</param>
 public SerializableElement(HtmlElement element, int outerHtmlStartPos, int outerHtmlEndPos, int innerHtmlStartPos, int innerHtmlEndPos, int textStartPos, int textEndPos, DefaultStyleLookup defaultStyleLookup)
 {
     this.Initialize(element, outerHtmlStartPos, outerHtmlEndPos, innerHtmlStartPos, innerHtmlEndPos, textStartPos, textEndPos, defaultStyleLookup);
 }
Esempio n. 14
0
        /// <summary>
        /// Initializes the serializable element from an existing html element.
        /// Attribute and style dictionaries are built from the element; its id, display order,
        /// parent id, children ids, bounding box, classifications and tag name are copied over.
        /// </summary>
        /// <param name="element">The html element to copy from</param>
        /// <param name="outerHtmlStartPos">The outer html start position</param>
        /// <param name="outerHtmlEndPos">The outer html end position</param>
        /// <param name="innerHtmlStartPos">The inner html start position</param>
        /// <param name="innerHtmlEndPos">The inner html end position</param>
        /// <param name="textStartPos">The text start position</param>
        /// <param name="textEndPos">The text end position</param>
        /// <param name="defaultStyleLookup">The default style lookup</param>
        public void Initialize(HtmlElement element, int outerHtmlStartPos, int outerHtmlEndPos, int innerHtmlStartPos, int innerHtmlEndPos, int textStartPos, int textEndPos, DefaultStyleLookup defaultStyleLookup)
        {
            // Build both dictionaries up front, before any element property is read
            var attributes = ConstructAttributeDictionary(element);
            var styles     = ConstructStyleDictionary(element);

            // Hand everything to the position-based overload
            Initialize(
                element.Id,
                element.DisplayOrder,
                element.ParentId,
                element.ChildrenIds,
                element.BoundingBox,
                element.Classifications,
                element.TagName,
                attributes,
                styles,
                defaultStyleLookup,
                outerHtmlStartPos,
                outerHtmlEndPos,
                innerHtmlStartPos,
                innerHtmlEndPos,
                textStartPos,
                textEndPos);
        }
Esempio n. 15
0
 /// <summary>
 /// Sets the default style lookup.
 /// Declared with <c>new</c>, so it hides the inherited member of the same name,
 /// and simply forwards the call to the base implementation.
 /// </summary>
 /// <param name="defaultStyleLookup">The default style lookup</param>
 // NOTE(review): presumably this exists to re-expose the base method with public accessibility - confirm against the base class
 public new void SetDefaultStyleLookup(DefaultStyleLookup defaultStyleLookup)
 {
     base.SetDefaultStyleLookup(defaultStyleLookup);
 }
Esempio n. 16
0
 /// <summary>
 /// Initializes a new instance of the <see cref="SerializableElement" /> class
 /// with explicit html and text content, forwarding every argument to the base <c>Initialize</c>.
 /// For testing purposes.
 /// </summary>
 /// <param name="id">The id</param>
 /// <param name="displayOrder">The display order</param>
 /// <param name="parentId">The parent id (nullable)</param>
 /// <param name="childrenIds">The children ids</param>
 /// <param name="boundingBox">The bounding box</param>
 /// <param name="classifications">The classifications</param>
 /// <param name="tagName">The tag name</param>
 /// <param name="outerHtml">The outer html</param>
 /// <param name="innerHtml">The inner html</param>
 /// <param name="outerText">The outer text</param>
 /// <param name="attributes">The attributes</param>
 /// <param name="styles">The styles</param>
 /// <param name="defaultStyleLookup">The default style lookup</param>
 public SerializableElement(int id, int displayOrder, int?parentId, IReadOnlyList <int> childrenIds, Rectangle boundingBox, IEnumerable <string> classifications,
                            string tagName, string outerHtml, string innerHtml, string outerText,
                            IDictionary <string, string> attributes, IDictionary <string, string> styles, DefaultStyleLookup defaultStyleLookup)
 {
     base.Initialize(id, displayOrder, parentId, childrenIds, boundingBox, classifications, tagName, outerHtml, innerHtml, outerText, attributes, styles, defaultStyleLookup);
 }
Esempio n. 17
0
 /// <summary>
 /// Initializes a new instance of the <see cref="MsHtmlElement" /> class
 /// </summary>
 /// <param name="id">The id</param>
 /// <param name="displayOrder">The display order</param>
 /// <param name="parentId">The parent id (nullable)</param>
 /// <param name="childrenIds">The children ids</param>
 /// <param name="classifications">The classifications</param>
 /// <param name="tagName">The tag name</param>
 /// <param name="outerHtml">The outer html</param>
 /// <param name="innerHtml">The inner html</param>
 /// <param name="text">The text</param>
 /// <param name="attributes">The attributes</param>
 /// <param name="styles">The styles</param>
 /// <param name="defaultStyleLookup">The default style lookup</param>
 /// <param name="offsetParentId">The offset parent id (nullable), kept by this instance</param>
 /// <param name="offsetRectangle">The offset rectangle, kept by this instance</param>
 public MsHtmlElement(int id, int displayOrder, int?parentId, IReadOnlyList <int> childrenIds, IEnumerable <string> classifications,
                      string tagName, string outerHtml, string innerHtml, string text,
                      IDictionary <string, string> attributes, IDictionary <string, string> styles, DefaultStyleLookup defaultStyleLookup,
                      int?offsetParentId, Rectangle offsetRectangle)
     // The display order argument is forwarded as given (previously the id was passed in its place,
     // leaving the displayOrder parameter unused). The base receives an empty bounding box;
     // only the offset rectangle is stored here.
     : base(id, displayOrder, parentId, childrenIds, Rectangle.Empty, classifications, tagName, outerHtml, innerHtml, text, attributes, styles, defaultStyleLookup)
 {
     _offsetParentId  = offsetParentId;
     _offsetRectangle = offsetRectangle;
 }
Esempio n. 18
0
        /// <summary>
        /// Recursively creates a <see cref="SerializableElement" /> for the given element and each of its descendants,
        /// appending their markup and text to the shared builders and recording the resulting positions
        /// </summary>
        /// <param name="element">The original element to convert</param>
        /// <param name="sElements">The list receiving every created serializable element; an element is added before its children</param>
        /// <param name="htmlBuilder">The HTML builder the markup is appended to</param>
        /// <param name="textBuilder">The text builder the text is appended to</param>
        /// <param name="defaultStyleLookup">The default style lookup handed to each created element</param>
        public void CreateElements(HtmlElement element, List <SerializableElement> sElements, StringBuilder htmlBuilder, StringBuilder textBuilder, DefaultStyleLookup defaultStyleLookup)
        {
            // Register the element before descending so it precedes its children in the list;
            // it is only initialized at the very end, once every position is known
            var serializable = new SerializableElement();
            sElements.Add(serializable);

            // Outer html begins before the start tag, inner html right after it
            int outerHtmlStartPos = htmlBuilder.Length;
            htmlBuilder.Append(element.StartTag);
            int innerHtmlStartPos = htmlBuilder.Length;

            // Text contributed by the element itself: one part before its content, one part after.
            // Both parts lie outside the recorded text range.
            string leadingText;
            string trailingText;
            GetElementText(element, out leadingText, out trailingText);

            textBuilder.Append(leadingText);
            int textStartPos = textBuilder.Length;

            if (element.ChildrenCount <= 0)
            {
                // No children: take the element's own inner html and outer text
                htmlBuilder.Append(element.InnerHtml);
                textBuilder.Append(element.OuterText);
            }
            else
            {
                // Children write their own content into the same builders
                foreach (HtmlElement child in element.Children)
                {
                    CreateElements(child, sElements, htmlBuilder, textBuilder, defaultStyleLookup);
                }
            }

            // Inner html ends before the end tag, outer html right after it
            int innerHtmlEndPos = htmlBuilder.Length;
            htmlBuilder.Append(element.EndTag);
            int outerHtmlEndPos = htmlBuilder.Length;

            // The text range closes before the trailing text is appended
            int textEndPos = textBuilder.Length;
            textBuilder.Append(trailingText);

            serializable.Initialize(element, outerHtmlStartPos, outerHtmlEndPos, innerHtmlStartPos, innerHtmlEndPos, textStartPos, textEndPos, defaultStyleLookup);
        }
Esempio n. 19
0
 /// <summary>
 /// Initializes a new instance of the <see cref="MsHtmlDocument" /> class
 /// </summary>
 /// <param name="root">The root element, passed on to the base constructor</param>
 /// <param name="info">The HTML document information, passed on to the base constructor</param>
 /// <param name="defaultStyleLookup">The default style lookup, passed on to the base constructor</param>
 /// <param name="displayHtml">The display HTML, assigned to <c>DisplayHtml</c></param>
 public MsHtmlDocument(MsHtmlElement root, HtmlDocumentInfo info, DefaultStyleLookup defaultStyleLookup, string displayHtml)
     : base(root, info, defaultStyleLookup)
 {
     this.DisplayHtml = displayHtml;
 }
Esempio n. 20
0
 /// <summary>
 /// Initializes a new instance of the <see cref="DisplayHiddenFilter"/> class
 /// </summary>
 /// <param name="defaultStyleLookup">The default style lookup kept by the filter</param>
 public DisplayHiddenFilter(DefaultStyleLookup defaultStyleLookup)
 {
     _defaultStyleLookup = defaultStyleLookup;
 }
Esempio n. 21
0
 /// <summary>
 /// Initializes a new instance of the <see cref="HtmlRenderer"/> class
 /// </summary>
 /// <param name="defaultStyles">The default styles, assigned to <c>DefaultStyles</c></param>
 public HtmlRenderer(DefaultStyleLookup defaultStyles)
 {
     this.DefaultStyles = defaultStyles;
 }
Esempio n. 22
0
        /// <summary>
        /// Initializes the serializable element from individual values.
        /// The base is initialized with empty outer html, inner html and outer text;
        /// the content is described by the start/end positions stored on this instance.
        /// </summary>
        /// <param name="id">The id</param>
        /// <param name="displayOrder">The display order</param>
        /// <param name="parentId">The parent id (nullable)</param>
        /// <param name="childrenIds">The children ids</param>
        /// <param name="boundingBox">The bounding box</param>
        /// <param name="classifications">The classifications</param>
        /// <param name="tagName">The tag name</param>
        /// <param name="attributes">The attributes</param>
        /// <param name="styles">The styles</param>
        /// <param name="deafultStyleLookup">The default style lookup</param>
        /// <param name="outerHtmlStartPos">The outer html start position</param>
        /// <param name="outerHtmlEndPos">The outer html end position</param>
        /// <param name="innerHtmlStartPos">The inner html start position</param>
        /// <param name="innerHtmlEndPos">The inner html end position</param>
        /// <param name="textStartPos">The text start position</param>
        /// <param name="textEndPos">The text end position</param>
        public void Initialize(int id, int displayOrder, int?parentId, IReadOnlyList <int> childrenIds, Rectangle boundingBox, IEnumerable <string> classifications, string tagName,
                               IDictionary <string, string> attributes, IDictionary <string, string> styles, DefaultStyleLookup deafultStyleLookup,
                               int outerHtmlStartPos, int outerHtmlEndPos,
                               int innerHtmlStartPos, int innerHtmlEndPos,
                               int textStartPos, int textEndPos)
        {
            // No html/text strings are handed to the base: three empty strings take their place
            base.Initialize(
                id, displayOrder, parentId, childrenIds, boundingBox, classifications, tagName,
                string.Empty, string.Empty, string.Empty,
                attributes, styles, deafultStyleLookup);

            // Outer html range
            OuterHtmlStartPos = outerHtmlStartPos;
            OuterHtmlEndPos   = outerHtmlEndPos;

            // Inner html range
            InnerHtmlStartPos = innerHtmlStartPos;
            InnerHtmlEndPos   = innerHtmlEndPos;

            // Text range
            TextStartPos = textStartPos;
            TextEndPos   = textEndPos;
        }