/// <summary>
/// Handles an opening tag: pushes a null entry onto <c>labelStacks</c>, applies the
/// <c>TagAction</c> registered for the tag name (if any), and records the event.
/// </summary>
/// <param name="node">The element whose start tag is being processed.</param>
public void StartElement(HtmlNode node)
{
    labelStacks.AddItem(null);

    var tagName = node.Name;
    TagAction action = tagActions.Get(tagName);

    if (action == null)
    {
        // No action registered for this tag: bump the level and set the flush flag.
        tagLevel++;
        flush = true;
    }
    else
    {
        if (action.ChangesTagLevel())
        {
            tagLevel++;
        }

        // Start is always invoked; its result is OR-ed into the existing flush flag.
        bool flushRequested = action.Start(this, tagName, node.Attributes);
        flush = flushRequested | flush;
    }

    lastEvent = NBoilerpipeContentHandler.Event.START_TAG;
    lastStartTag = tagName;
}
/// <summary>
/// Handles a closing tag: applies the <c>TagAction</c> registered for the tag name
/// (if any), decrements the tag level when appropriate, flushes the current block
/// if the flush flag is set, and pops the last entry from <c>labelStacks</c>.
/// </summary>
/// <param name="node">The element whose end tag is being processed.</param>
public void EndElement(HtmlNode node)
{
    var tagName = node.Name;
    TagAction action = tagActions.Get(tagName);

    bool lowersTagLevel;
    if (action == null)
    {
        // No action registered for this tag: set the flush flag and drop one level.
        flush = true;
        lowersTagLevel = true;
    }
    else
    {
        // End is always invoked; its result is OR-ed into the existing flush flag.
        bool flushRequested = action.End(this, tagName);
        flush = flushRequested | flush;
        lowersTagLevel = action.ChangesTagLevel();
    }

    if (lowersTagLevel)
    {
        tagLevel--;
    }

    if (flush)
    {
        FlushBlock();
    }

    lastEvent = NBoilerpipeContentHandler.Event.END_TAG;
    lastEndTag = tagName;
    labelStacks.RemoveLast();
}
/// <summary>
/// Handles a text node. The node's text is blanked when <c>IsTag</c> accepts it, then
/// HTML-decoded. Every character accepted by <c>IsWhiteSpace</c> is replaced with a
/// plain space, leading and trailing spaces are trimmed, and the remaining characters
/// are appended to <c>textBuilder</c> and <c>tokenBuilder</c> with at most one
/// separating space on each side.
/// </summary>
/// <param name="node">The text node to process; its <c>Text</c> may be reset to "".</param>
public void HandleText(HtmlTextNode node)
{
    if (IsTag(node.Text))
    {
        node.Text = "";
    }

    char[] chars = HttpUtility.HtmlDecode(node.Text).ToCharArray();
    int total = chars.Length;

    // The text element counter advances even if the text is later ignored.
    textElementIdx++;

    if (flush)
    {
        FlushBlock();
        flush = false;
    }

    if (inIgnorableElement != 0 || total == 0)
    {
        return;
    }

    // Replace every whitespace character with a plain space.
    for (int i = 0; i < total; i++)
    {
        if (IsWhiteSpace(chars[i]))
        {
            chars[i] = ' ';
        }
    }

    // Index of the first non-space character (== total when the text is all spaces).
    int first = 0;
    while (first < total && chars[first] == ' ')
    {
        first++;
    }

    // Exclusive index just past the last non-space character.
    int last = total;
    while (last > first && chars[last - 1] == ' ')
    {
        last--;
    }

    bool leadingSpace = first > 0;
    bool trailingSpace = last < total;
    int count = last - first;

    if (count == 0)
    {
        // Nothing but spaces: emit at most one separator and report WHITESPACE.
        if (leadingSpace || trailingSpace)
        {
            if (!sbLastWasWhitespace)
            {
                textBuilder.Append(' ');
                tokenBuilder.Append(' ');
            }
            sbLastWasWhitespace = true;
        }
        else
        {
            sbLastWasWhitespace = false;
        }
        lastEvent = NBoilerpipeContentHandler.Event.WHITESPACE;
        return;
    }

    if (leadingSpace && !sbLastWasWhitespace)
    {
        textBuilder.Append(' ');
        tokenBuilder.Append(' ');
    }

    // -1 means no level has been recorded yet for the current block.
    if (blockTagLevel == -1)
    {
        blockTagLevel = tagLevel;
    }

    textBuilder.Append(chars, first, count);
    tokenBuilder.Append(chars, first, count);

    if (trailingSpace)
    {
        textBuilder.Append(' ');
        tokenBuilder.Append(' ');
    }

    sbLastWasWhitespace = trailingSpace;
    lastEvent = NBoilerpipeContentHandler.Event.CHARACTERS;
    currentContainedTextElements.Add(textElementIdx);
}
/// <summary>
/// Processes an end tag. The <c>TagAction</c> looked up by tag name (when present)
/// is given the chance to request a flush; the tag level is decremented when there
/// is no action or the action changes the tag level; a pending flush is executed;
/// finally the event is recorded and the last <c>labelStacks</c> entry is removed.
/// </summary>
/// <param name="node">The element being closed.</param>
public void EndElement(HtmlNode node)
{
    var name = node.Name;
    TagAction tagAction = tagActions.Get (name);

    bool decrementLevel;
    if (tagAction == null)
    {
        // Tags without a registered action set the flush flag and change the level.
        flush = true;
        decrementLevel = true;
    }
    else
    {
        // Call End first, then combine its answer with the current flush flag.
        bool wantsFlush = tagAction.End (this, name);
        flush = wantsFlush | flush;
        decrementLevel = tagAction.ChangesTagLevel ();
    }

    if (decrementLevel)
    {
        tagLevel--;
    }

    if (flush)
    {
        FlushBlock ();
    }

    lastEvent = NBoilerpipeContentHandler.Event.END_TAG;
    lastEndTag = name;
    labelStacks.RemoveLast ();
}
/// <summary>
/// Processes a start tag. A null entry is added to <c>labelStacks</c>, then the
/// <c>TagAction</c> looked up by tag name (when present) may raise the tag level
/// and request a flush; tags without an action always raise the level and set
/// the flush flag. The event and tag name are recorded afterwards.
/// </summary>
/// <param name="node">The element being opened.</param>
public void StartElement(HtmlNode node)
{
    labelStacks.AddItem (null);

    var name = node.Name;
    TagAction tagAction = tagActions.Get (name);

    if (tagAction == null)
    {
        tagLevel++;
        flush = true;
    }
    else
    {
        if (tagAction.ChangesTagLevel ())
        {
            tagLevel++;
        }

        // Call Start first, then combine its answer with the current flush flag.
        bool wantsFlush = tagAction.Start (this, name, node.Attributes);
        flush = wantsFlush | flush;
    }

    lastEvent = NBoilerpipeContentHandler.Event.START_TAG;
    lastStartTag = name;
}
/// <summary>
/// Processes a text node. Text that <c>IsTag</c> accepts is reset to the empty string;
/// the text is then HTML-decoded, each character accepted by <c>IsWhiteSpace</c> is
/// turned into a plain space, surrounding spaces are stripped, and what remains is
/// appended to <c>textBuilder</c> and <c>tokenBuilder</c>, separated by single spaces
/// where the original text had leading or trailing whitespace.
/// </summary>
/// <param name="node">The text node to process; its <c>Text</c> may be reset to "".</param>
public void HandleText(HtmlTextNode node)
{
    if (IsTag (node.Text))
    {
        node.Text = "";
    }

    char[] buffer = HttpUtility.HtmlDecode (node.Text).ToCharArray ();
    int bufferLength = buffer.Length;

    // Counted before any early return, so ignored or empty text still advances the index.
    textElementIdx++;

    if (flush)
    {
        FlushBlock ();
        flush = false;
    }

    if (inIgnorableElement != 0 || bufferLength == 0)
    {
        return;
    }

    // Map every whitespace character onto a plain space.
    for (int pos = 0; pos < bufferLength; pos++)
    {
        if (IsWhiteSpace (buffer [pos]))
        {
            buffer [pos] = ' ';
        }
    }

    // Skip leading spaces; contentStart == bufferLength means the text is all spaces.
    int contentStart = 0;
    while (contentStart < bufferLength && buffer [contentStart] == ' ')
    {
        contentStart++;
    }

    // Walk back over trailing spaces; contentEnd is exclusive.
    int contentEnd = bufferLength;
    while (contentEnd > contentStart && buffer [contentEnd - 1] == ' ')
    {
        contentEnd--;
    }

    bool hadLeadingSpace = contentStart > 0;
    bool hadTrailingSpace = contentEnd < bufferLength;
    int contentLength = contentEnd - contentStart;

    if (contentLength == 0)
    {
        // Whitespace-only text: add at most one separator and report WHITESPACE.
        if (hadLeadingSpace || hadTrailingSpace)
        {
            if (!sbLastWasWhitespace)
            {
                textBuilder.Append (' ');
                tokenBuilder.Append (' ');
            }
            sbLastWasWhitespace = true;
        }
        else
        {
            sbLastWasWhitespace = false;
        }
        lastEvent = NBoilerpipeContentHandler.Event.WHITESPACE;
        return;
    }

    if (hadLeadingSpace && !sbLastWasWhitespace)
    {
        textBuilder.Append (' ');
        tokenBuilder.Append (' ');
    }

    // -1 marks that no tag level has been captured for the current block yet.
    if (blockTagLevel == -1)
    {
        blockTagLevel = tagLevel;
    }

    textBuilder.Append (buffer, contentStart, contentLength);
    tokenBuilder.Append (buffer, contentStart, contentLength);

    if (hadTrailingSpace)
    {
        textBuilder.Append (' ');
        tokenBuilder.Append (' ');
    }

    sbLastWasWhitespace = hadTrailingSpace;
    lastEvent = NBoilerpipeContentHandler.Event.CHARACTERS;
    currentContainedTextElements.Add (textElementIdx);
}