/// <summary>
/// Initializes a new instance of the <see cref="WebBrowserDocument"/> class
/// </summary>
/// <param name="url">The URL kept for this document</param>
/// <param name="msDoc">The MSHTML document kept for this document</param>
public WebBrowserDocument(string url, IHTMLDocument2 msDoc)
{
    // Plain state first: the constructor arguments and empty bookkeeping
    _url = url;
    _msDoc = msDoc;
    _domModificationsApplied = false;
    _highlights = new List<HtmlHighlight>();

    // NOTE(review): ReplaceFreeTextNodes is listed both before and after
    // ReplacePreservedWhitespace - presumably deliberate; confirm before changing.
    _domModifications = new IDomModification[]
    {
        new RemoveOrphanElements(),
        new RemoveUnknownElements(),
        new RemoveCommentElements(),
        new RemoveEmptyFreeTextNodes(),
        new ReplaceFreeTextNodes(),
        new ReplacePreservedWhitespace(),
        new ReplaceFreeTextNodes(),
    };

    // The classifier and the zone tree builder receive the same element types
    _elementClassifier = new PreZoningClassification(
        HtmlElementType.SignificantBlock,
        HtmlElementType.SignificantInline,
        HtmlElementType.SignificantLinebreak,
        HtmlElementType.SignificantInvisible,
        HtmlElementType.BreakDown,
        HtmlElementType.Aname,
        HtmlElementType.Hidden);

    _zoneTreeBuilder = ZoneTreeBuilder.Create(
        HtmlElementType.SignificantBlock,
        HtmlElementType.SignificantInline,
        HtmlElementType.SignificantLinebreak,
        HtmlElementType.SignificantInvisible,
        HtmlElementType.BreakDown,
        HtmlElementType.Aname,
        HtmlElementType.Hidden);

    // The document factory is built on the Internet Explorer default styles
    _documentFactory = new MsHtmlDocumentFactory(DefaultStyleLookup.CreateForInternetExplorer());
}
/// <summary>
/// Initializes a new instance of the <see cref="AppServices"/> class,
/// creating the serializers, the element classifier, the zone and column
/// tree builders and the three article content labelers
/// </summary>
public AppServices()
{
    // Result validation is switched off for the HTML document serializer
    const bool validateSerializationResult = false;

    // Both serializers share one Internet Explorer default style lookup
    var ieDefaultStyles = DefaultStyleLookup.CreateForInternetExplorer();
    _htmlDocumentSerializer = new HtmlDocumentSerializer(ieDefaultStyles, validateSerializationResult);
    _zoneTreeSerializer = new ZoneTreeSerializer(ieDefaultStyles);

    // The classifier and the zone tree builder receive the same element types
    _elementClassifier = new PreZoningClassification(
        HtmlElementType.SignificantBlock,
        HtmlElementType.SignificantInline,
        HtmlElementType.SignificantLinebreak,
        HtmlElementType.SignificantInvisible,
        HtmlElementType.BreakDown,
        HtmlElementType.Aname,
        HtmlElementType.Hidden);

    _zoneTreeBuilder = ZoneTreeBuilder.Create(
        HtmlElementType.SignificantBlock,
        HtmlElementType.SignificantInline,
        HtmlElementType.SignificantLinebreak,
        HtmlElementType.SignificantInvisible,
        HtmlElementType.BreakDown,
        HtmlElementType.Aname,
        HtmlElementType.Hidden);

    _columnTreeBuilder = ColumnTreeBuilder.Create();

    // Layout-analysis labeler, backed by a natural language processor
    var languageProcessor = new OpenNaturalLanguageProcessor();
    _layoutAnalysisArticleContentLabeler = new ArticleContentLabeler(
        languageProcessor,
        ZoneLabel.Paragraph,
        ZoneLabel.ArticleContent,
        ZoneFeature.Common_Tokens);

    // Semantic-tag labelers for the ARTICLE and MAIN tags
    _articleTagArticleContentLabeler = new SemanticTagArticleContentLabeler(Html.Tags.ARTICLE, ZoneLabel.ArticleContent);
    _mainTagArticleContentLabeler = new SemanticTagArticleContentLabeler(Html.Tags.MAIN, ZoneLabel.ArticleContent);
}
/// <summary>
/// Initializes a new instance of the <see cref="HtmlDocumentSerializer" /> class
/// </summary>
/// <param name="defaultStyleLookup">The default style lookup kept by the serializer</param>
/// <param name="validateResult">Whether to validate the serialization result; <c>false</c> when omitted</param>
public HtmlDocumentSerializer(DefaultStyleLookup defaultStyleLookup, bool validateResult = false)
{
    _defaultStyleLookup = defaultStyleLookup;
    _validateResult = validateResult;

    // XML serializer for the element list, paired with a single empty qualified name
    _serializer = new XmlSerializer(typeof(SerializableElementList));
    _namespaces = new XmlSerializerNamespaces(new[] { XmlQualifiedName.Empty });

    // Writer: indented output, character checking off, new lines entitized
    var writerSettings = new XmlWriterSettings();
    writerSettings.Indent = true;
    writerSettings.CheckCharacters = false;
    writerSettings.NewLineHandling = NewLineHandling.Entitize;
    _writerSettings = writerSettings;

    // Reader: character checking off, whitespace ignored
    var readerSettings = new XmlReaderSettings();
    readerSettings.CheckCharacters = false;
    readerSettings.IgnoreWhitespace = true;
    _readerSettings = readerSettings;
}
/// <summary>
/// Initializes a new instance of the <see cref="HtmlDocumentSerializer" /> class
/// using the Internet Explorer default style lookup
/// </summary>
public HtmlDocumentSerializer()
    : this(DefaultStyleLookup.CreateForInternetExplorer())
{
    // Everything is done by the chained constructor
}
/// <summary>
/// Initializes a new instance of the <see cref="SerializableDocument" /> class
/// </summary>
/// <param name="root">The root element, forwarded to the base constructor</param>
/// <param name="info">The HTML document information, forwarded to the base constructor</param>
/// <param name="defaultStyleLookup">The default style lookup, forwarded to the base constructor</param>
/// <param name="html">The value assigned to <c>Html</c></param>
/// <param name="text">The value assigned to <c>Text</c></param>
public SerializableDocument(
    SerializableElement root,
    HtmlDocumentInfo info,
    DefaultStyleLookup defaultStyleLookup,
    string html,
    string text)
    : base(root, info, defaultStyleLookup)
{
    // Only the html and the text are kept on this class itself
    this.Html = html;
    this.Text = text;
}
/// <summary>
/// Initializes a new instance of the <see cref="WebBrowser" /> class
/// </summary>
/// <param name="size">The size kept by the browser</param>
/// <param name="defaultStyles">The value assigned to <c>DefaultStyles</c></param>
public WebBrowser(Size size, DefaultStyleLookup defaultStyles)
{
    _size = size;
    this.DefaultStyles = defaultStyles;

    // A freshly constructed browser starts with rendering marked as not complete
    this.RenderingComplete = false;
}
/// <summary>
/// Initializes a new instance of the <see cref="MsHtmlElementFactory"/> class
/// </summary>
/// <param name="sourceIndexOffset">The source index offset kept by the factory</param>
/// <param name="defaultStyleLookup">The default style lookup kept by the factory</param>
public MsHtmlElementFactory(int sourceIndexOffset, DefaultStyleLookup defaultStyleLookup)
{
    // Both arguments are only stored here
    _defaultStyleLookup = defaultStyleLookup;
    _sourceIndexOffset = sourceIndexOffset;
}
/// <summary>
/// Initializes a new instance of the <see cref="SerializableElement" /> class
/// by passing every argument, unchanged, to <c>Initialize</c>
/// </summary>
/// <param name="id">The id</param>
/// <param name="displayOrder">The display order</param>
/// <param name="parentId">The parent id</param>
/// <param name="childrenIds">The children ids</param>
/// <param name="boundingBox">The bounding box</param>
/// <param name="classifications">The classifications</param>
/// <param name="tagName">The tag name</param>
/// <param name="attributes">The attributes</param>
/// <param name="styles">The styles</param>
/// <param name="defaultStyleLookup">The default style lookup</param>
/// <param name="outerHtmlStartPos">The outer html start position</param>
/// <param name="outerHtmlEndPos">The outer html end position</param>
/// <param name="innerHtmlStartPos">The inner html start position</param>
/// <param name="innerHtmlEndPos">The inner html end position</param>
/// <param name="textStartPos">The text start position</param>
/// <param name="textEndPos">The text end position</param>
public SerializableElement(
    int id,
    int displayOrder,
    int? parentId,
    IReadOnlyList<int> childrenIds,
    Rectangle boundingBox,
    IEnumerable<string> classifications,
    string tagName,
    IDictionary<string, string> attributes,
    IDictionary<string, string> styles,
    DefaultStyleLookup defaultStyleLookup,
    int outerHtmlStartPos,
    int outerHtmlEndPos,
    int innerHtmlStartPos,
    int innerHtmlEndPos,
    int textStartPos,
    int textEndPos)
{
    this.Initialize(
        id,
        displayOrder,
        parentId,
        childrenIds,
        boundingBox,
        classifications,
        tagName,
        attributes,
        styles,
        defaultStyleLookup,
        outerHtmlStartPos,
        outerHtmlEndPos,
        innerHtmlStartPos,
        innerHtmlEndPos,
        textStartPos,
        textEndPos);
}
/// <summary>
/// Initializes a new instance of the <see cref="MsHtmlDocument" /> class
/// </summary>
/// <param name="root">The root element, forwarded to the base constructor</param>
/// <param name="info">The HTML document information, forwarded to the base constructor</param>
/// <param name="defaultStyleLookup">The default style lookup, forwarded to the base constructor</param>
public MsHtmlDocument(MsHtmlElement root, HtmlDocumentInfo info, DefaultStyleLookup defaultStyleLookup)
    : base(root, info, defaultStyleLookup)
{
    // Nothing to set up beyond the base constructor
}
/// <summary>
/// Initializes a new instance of the <see cref="MsHtmlDocumentFactory"/> class
/// </summary>
/// <param name="defaultStyleLookup">The default style lookup kept by the factory</param>
public MsHtmlDocumentFactory(DefaultStyleLookup defaultStyleLookup)
{
    // The lookup is only stored here
    _defaultStyleLookup = defaultStyleLookup;
}
/// <summary>
/// Initializes a new instance of the <see cref="InternetExplorer" /> class
/// </summary>
/// <param name="winFormsBrowser">The Windows Forms browser kept by this instance</param>
/// <param name="size">The size, forwarded to the base constructor</param>
/// <param name="defaultStyles">The default styles, forwarded to the base constructor and to the document factory</param>
public InternetExplorer(
    System.Windows.Forms.WebBrowser winFormsBrowser,
    Size size,
    DefaultStyleLookup defaultStyles)
    : base(size, defaultStyles)
{
    _msDocFactory = new MsHtmlDocumentFactory(defaultStyles);
    _winFormsBrowser = winFormsBrowser;

    // Runs last, once both fields above have been assigned
    this.ConfigureBrowser();
}
/// <summary>
/// Initializes a new instance of the <see cref="InternetExplorer" /> class
/// using the Internet Explorer default style lookup
/// </summary>
/// <param name="winFormsBrowser">The Windows Forms browser</param>
/// <param name="size">The size</param>
public InternetExplorer(System.Windows.Forms.WebBrowser winFormsBrowser, Size size)
    : this(winFormsBrowser, size, DefaultStyleLookup.CreateForInternetExplorer())
{
    // Everything is done by the chained constructor
}
/// <summary>
/// Initializes a new instance of the <see cref="SerializableElement" /> class
/// from an existing html element by passing every argument, unchanged, to <c>Initialize</c>
/// </summary>
/// <param name="element">The html element</param>
/// <param name="outerHtmlStartPos">The outer html start position</param>
/// <param name="outerHtmlEndPos">The outer html end position</param>
/// <param name="innerHtmlStartPos">The inner html start position</param>
/// <param name="innerHtmlEndPos">The inner html end position</param>
/// <param name="textStartPos">The text start position</param>
/// <param name="textEndPos">The text end position</param>
/// <param name="defaultStyleLookup">The default style lookup</param>
public SerializableElement(
    HtmlElement element,
    int outerHtmlStartPos,
    int outerHtmlEndPos,
    int innerHtmlStartPos,
    int innerHtmlEndPos,
    int textStartPos,
    int textEndPos,
    DefaultStyleLookup defaultStyleLookup)
{
    this.Initialize(
        element,
        outerHtmlStartPos,
        outerHtmlEndPos,
        innerHtmlStartPos,
        innerHtmlEndPos,
        textStartPos,
        textEndPos,
        defaultStyleLookup);
}
/// <summary>
/// Initializes the serializable element from an existing html element.
/// The attribute and style dictionaries are constructed from the element;
/// the remaining values are read from its properties.
/// </summary>
/// <param name="element">The html element to copy from</param>
/// <param name="outerHtmlStartPos">The outer html start position</param>
/// <param name="outerHtmlEndPos">The outer html end position</param>
/// <param name="innerHtmlStartPos">The inner html start position</param>
/// <param name="innerHtmlEndPos">The inner html end position</param>
/// <param name="textStartPos">The text start position</param>
/// <param name="textEndPos">The text end position</param>
/// <param name="defaultStyleLookup">The default style lookup</param>
public void Initialize(
    HtmlElement element,
    int outerHtmlStartPos,
    int outerHtmlEndPos,
    int innerHtmlStartPos,
    int innerHtmlEndPos,
    int textStartPos,
    int textEndPos,
    DefaultStyleLookup defaultStyleLookup)
{
    // Build both dictionaries up front, attributes before styles
    var attributes = ConstructAttributeDictionary(element);
    var styles = ConstructStyleDictionary(element);

    // Hand over to the overload that takes the individual values
    this.Initialize(
        element.Id,
        element.DisplayOrder,
        element.ParentId,
        element.ChildrenIds,
        element.BoundingBox,
        element.Classifications,
        element.TagName,
        attributes,
        styles,
        defaultStyleLookup,
        outerHtmlStartPos,
        outerHtmlEndPos,
        innerHtmlStartPos,
        innerHtmlEndPos,
        textStartPos,
        textEndPos);
}
/// <summary>
/// Sets the default style lookup by forwarding to the base implementation.
/// Declared with <c>new</c>, so it hides the inherited member of the same name.
/// </summary>
/// <param name="defaultStyleLookup">The default style lookup to set</param>
public new void SetDefaultStyleLookup(DefaultStyleLookup defaultStyleLookup)
{
    // No extra work here: the base class does the actual assignment
    base.SetDefaultStyleLookup(defaultStyleLookup);
}
/// <summary>
/// Initializes a new instance of the <see cref="SerializableElement" /> class
/// with explicit html and text strings, passing every argument to the base <c>Initialize</c>.
/// For testing purposes.
/// </summary>
/// <param name="id">The id</param>
/// <param name="displayOrder">The display order</param>
/// <param name="parentId">The parent id</param>
/// <param name="childrenIds">The children ids</param>
/// <param name="boundingBox">The bounding box</param>
/// <param name="classifications">The classifications</param>
/// <param name="tagName">The tag name</param>
/// <param name="outerHtml">The outer html</param>
/// <param name="innerHtml">The inner html</param>
/// <param name="outerText">The outer text</param>
/// <param name="attributes">The attributes</param>
/// <param name="styles">The styles</param>
/// <param name="defaultStyleLookup">The default style lookup</param>
public SerializableElement(
    int id,
    int displayOrder,
    int? parentId,
    IReadOnlyList<int> childrenIds,
    Rectangle boundingBox,
    IEnumerable<string> classifications,
    string tagName,
    string outerHtml,
    string innerHtml,
    string outerText,
    IDictionary<string, string> attributes,
    IDictionary<string, string> styles,
    DefaultStyleLookup defaultStyleLookup)
{
    base.Initialize(
        id,
        displayOrder,
        parentId,
        childrenIds,
        boundingBox,
        classifications,
        tagName,
        outerHtml,
        innerHtml,
        outerText,
        attributes,
        styles,
        defaultStyleLookup);
}
/// <summary>
/// Initializes a new instance of the <see cref="MsHtmlElement" /> class
/// </summary>
/// <param name="id">The id</param>
/// <param name="displayOrder">The display order</param>
/// <param name="parentId">The parent id</param>
/// <param name="childrenIds">The children ids</param>
/// <param name="classifications">The classifications</param>
/// <param name="tagName">The tag name</param>
/// <param name="outerHtml">The outer html</param>
/// <param name="innerHtml">The inner html</param>
/// <param name="text">The text</param>
/// <param name="attributes">The attributes</param>
/// <param name="styles">The styles</param>
/// <param name="defaultStyleLookup">The default style lookup</param>
/// <param name="offsetParentId">The offset parent id</param>
/// <param name="offsetRectangle">The offset rectangle</param>
public MsHtmlElement(
    int id,
    int displayOrder,
    int? parentId,
    IReadOnlyList<int> childrenIds,
    IEnumerable<string> classifications,
    string tagName,
    string outerHtml,
    string innerHtml,
    string text,
    IDictionary<string, string> attributes,
    IDictionary<string, string> styles,
    DefaultStyleLookup defaultStyleLookup,
    int? offsetParentId,
    Rectangle offsetRectangle)
    : base(
        id,
        displayOrder, // previously `id` was passed here and this parameter was silently ignored
        parentId,
        childrenIds,
        Rectangle.Empty, // no bounding box is supplied at construction; only the offset data below is kept
        classifications,
        tagName,
        outerHtml,
        innerHtml,
        text,
        attributes,
        styles,
        defaultStyleLookup)
{
    _offsetParentId = offsetParentId;
    _offsetRectangle = offsetRectangle;
}
/// <summary>
/// Creates the serializable element for <paramref name="element"/> and, recursively,
/// for all of its children, appending html and text to the two builders and
/// recording the positions at which the element's parts begin and end
/// </summary>
/// <param name="element">The original element</param>
/// <param name="sElements">The list receiving every created serializable element</param>
/// <param name="htmlBuilder">The HTML builder</param>
/// <param name="textBuilder">The text builder</param>
/// <param name="defaultStyleLookup">The default style lookup</param>
public void CreateElements(
    HtmlElement element,
    List<SerializableElement> sElements,
    StringBuilder htmlBuilder,
    StringBuilder textBuilder,
    DefaultStyleLookup defaultStyleLookup)
{
    // Add the still uninitialized element before descending, so that it
    // comes ahead of its descendants in the list
    var serializable = new SerializableElement();
    sElements.Add(serializable);

    // Outer html begins in front of the start tag, inner html right behind it
    int outerHtmlStart = htmlBuilder.Length;
    htmlBuilder.Append(element.StartTag);
    int innerHtmlStart = htmlBuilder.Length;

    // The leading text is written before the element's text span begins
    string leadingText;
    string trailingText;
    GetElementText(element, out leadingText, out trailingText);
    textBuilder.Append(leadingText);
    int textStart = textBuilder.Length;

    if (element.ChildrenCount <= 0)
    {
        // Childless element: take its inner html and outer text directly
        htmlBuilder.Append(element.InnerHtml);
        textBuilder.Append(element.OuterText);
    }
    else
    {
        // Otherwise the content is produced by the children
        foreach (HtmlElement child in element.Children)
        {
            CreateElements(child, sElements, htmlBuilder, textBuilder, defaultStyleLookup);
        }
    }

    // Inner html ends in front of the end tag, outer html right behind it
    int innerHtmlEnd = htmlBuilder.Length;
    htmlBuilder.Append(element.EndTag);
    int outerHtmlEnd = htmlBuilder.Length;

    // The trailing text is written after the element's text span has ended
    int textEnd = textBuilder.Length;
    textBuilder.Append(trailingText);

    // All positions are known now, so the element can be initialized
    serializable.Initialize(
        element,
        outerHtmlStart,
        outerHtmlEnd,
        innerHtmlStart,
        innerHtmlEnd,
        textStart,
        textEnd,
        defaultStyleLookup);
}
/// <summary>
/// Initializes a new instance of the <see cref="MsHtmlDocument" /> class
/// that also carries display HTML
/// </summary>
/// <param name="root">The root element, forwarded to the base constructor</param>
/// <param name="info">The HTML document information, forwarded to the base constructor</param>
/// <param name="defaultStyleLookup">The default style lookup, forwarded to the base constructor</param>
/// <param name="displayHtml">The value assigned to <c>DisplayHtml</c></param>
public MsHtmlDocument(
    MsHtmlElement root,
    HtmlDocumentInfo info,
    DefaultStyleLookup defaultStyleLookup,
    string displayHtml)
    : base(root, info, defaultStyleLookup)
{
    // The display HTML is the only state set by this overload itself
    this.DisplayHtml = displayHtml;
}
/// <summary>
/// Initializes a new instance of the <see cref="DisplayHiddenFilter"/> class
/// </summary>
/// <param name="defaultStyleLookup">The default style lookup kept by the filter</param>
public DisplayHiddenFilter(DefaultStyleLookup defaultStyleLookup)
{
    // The lookup is only stored here
    _defaultStyleLookup = defaultStyleLookup;
}
/// <summary>
/// Initializes a new instance of the <see cref="HtmlRenderer"/> class
/// </summary>
/// <param name="defaultStyles">The value assigned to <c>DefaultStyles</c></param>
public HtmlRenderer(DefaultStyleLookup defaultStyles)
{
    // The styles are only stored here
    this.DefaultStyles = defaultStyles;
}
/// <summary>
/// Initializes the serializable element from individual values. The base
/// <c>Initialize</c> is called with empty strings for its three string arguments
/// after the tag name; the start and end positions are kept on this instance.
/// </summary>
/// <param name="id">The id</param>
/// <param name="displayOrder">The display order</param>
/// <param name="parentId">The parent id</param>
/// <param name="childrenIds">The children ids</param>
/// <param name="boundingBox">The bounding box</param>
/// <param name="classifications">The classifications</param>
/// <param name="tagName">The tag name</param>
/// <param name="attributes">The attributes</param>
/// <param name="styles">The styles</param>
/// <param name="deafultStyleLookup">The default style lookup</param>
/// <param name="outerHtmlStartPos">The outer html start position</param>
/// <param name="outerHtmlEndPos">The outer html end position</param>
/// <param name="innerHtmlStartPos">The inner html start position</param>
/// <param name="innerHtmlEndPos">The inner html end position</param>
/// <param name="textStartPos">The text start position</param>
/// <param name="textEndPos">The text end position</param>
public void Initialize(
    int id,
    int displayOrder,
    int? parentId,
    IReadOnlyList<int> childrenIds,
    Rectangle boundingBox,
    IEnumerable<string> classifications,
    string tagName,
    IDictionary<string, string> attributes,
    IDictionary<string, string> styles,
    DefaultStyleLookup deafultStyleLookup,
    int outerHtmlStartPos,
    int outerHtmlEndPos,
    int innerHtmlStartPos,
    int innerHtmlEndPos,
    int textStartPos,
    int textEndPos)
{
    // NOTE(review): the parameter name "deafultStyleLookup" is misspelled; it is
    // left as is because renaming it would break callers that use named arguments.
    base.Initialize(
        id,
        displayOrder,
        parentId,
        childrenIds,
        boundingBox,
        classifications,
        tagName,
        string.Empty,
        string.Empty,
        string.Empty,
        attributes,
        styles,
        deafultStyleLookup);

    // Outer html span
    this.OuterHtmlStartPos = outerHtmlStartPos;
    this.OuterHtmlEndPos = outerHtmlEndPos;

    // Inner html span
    this.InnerHtmlStartPos = innerHtmlStartPos;
    this.InnerHtmlEndPos = innerHtmlEndPos;

    // Text span
    this.TextStartPos = textStartPos;
    this.TextEndPos = textEndPos;
}