/// <summary>
/// Splits every text block whose text contains line breaks into one block per
/// paragraph. Each new block inherits the content flag and labels of the block it
/// was cut from. The document's block list is rewritten in place when anything
/// was split.
/// </summary>
/// <param name="doc">Document whose text blocks are examined and possibly replaced.</param>
/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException">Not raised directly by this implementation.</exception>
public bool Process(TextDocument doc)
{
    bool changes = false;
    IList<TextBlock> blocks = doc.GetTextBlocks();
    IList<TextBlock> blocksNew = new AList<TextBlock>();
    foreach (TextBlock tb in blocks)
    {
        string text = tb.GetText();
        if (text == null)
        {
            // Nothing to split; keep the block as it is.
            blocksNew.AddItem(tb);
            continue;
        }

        // "[\n\r]+" is a regular expression (one or more line-break characters).
        // It must go through Regex.Split: string.Split(string) treats its argument
        // as a literal separator, which would never match and never split anything.
        string[] paragraphs = System.Text.RegularExpressions.Regex.Split(text, "[\n\r]+");

        // Ignore trailing empty pieces (text ending in a line break), matching the
        // Java String.split contract this code was ported from.
        int paragraphCount = paragraphs.Length;
        while (paragraphCount > 0 && paragraphs[paragraphCount - 1].Length == 0)
        {
            paragraphCount--;
        }

        if (paragraphCount < 2)
        {
            blocksNew.AddItem(tb);
            continue;
        }

        bool isContent = tb.IsContent();
        ICollection<string> labels = tb.GetLabels();
        for (int i = 0; i < paragraphCount; i++)
        {
            TextBlock tbP = new TextBlock(paragraphs[i]);
            tbP.SetIsContent(isContent);
            tbP.AddLabels(labels);
            blocksNew.AddItem(tbP);
        }
        changes = true;
    }

    if (changes)
    {
        // Replace the contents of the document's own list rather than swapping the list.
        blocks.Clear();
        Sharpen.Collections.AddAll(blocks, blocksNew);
    }
    return changes;
}
/// <summary>
/// Applies this action's labels to <paramref name="tb"/>, but only when
/// <c>condition.MeetsCondition(tb)</c> holds.
/// </summary>
/// <param name="tb">Block to test and, if it qualifies, to label.</param>
public override void AddTo(TextBlock tb)
{
    bool qualifies = condition.MeetsCondition(tb);
    if (!qualifies)
    {
        return;
    }
    AddLabelsTo(tb);
}
/// <summary>
/// Decides whether <paramref name="curr"/> is content from the link and text
/// densities of the block itself and its two neighbours, then stores the decision
/// on <paramref name="curr"/>.
/// </summary>
/// <param name="prev">Block preceding <paramref name="curr"/>.</param>
/// <param name="curr">Block being classified; its content flag is updated.</param>
/// <param name="next">Block following <paramref name="curr"/>.</param>
/// <returns>Whatever <c>curr.SetIsContent</c> returns for the computed flag.</returns>
protected internal virtual bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    // The thresholds are tested as negated "<=" so that every input, including
    // NaN, takes exactly the same branch as a plain "<=" test with an else arm.
    bool isContent;
    if (!(curr.GetLinkDensity() <= 0.333333))
    {
        // Current block is link-heavy.
        isContent = false;
    }
    else if (!(prev.GetLinkDensity() <= 0.555556))
    {
        // Previous block is link-heavy: only a dense following block qualifies.
        isContent = !(next.GetTextDensity() <= 11);
    }
    else if (!(curr.GetTextDensity() <= 9))
    {
        // Dense current block: content unless the next block has zero density.
        isContent = next.GetTextDensity() != 0;
    }
    else if (!(next.GetTextDensity() <= 10))
    {
        // Sparse current block followed by a dense one.
        isContent = true;
    }
    else
    {
        // Sparse current and next block: decided by the previous block's density.
        isContent = !(prev.GetTextDensity() <= 4);
    }
    return curr.SetIsContent(isContent);
}
/// <summary>
/// Returns the word count of <paramref name="tb"/> if its text density reaches
/// <paramref name="minTextDensity"/>, and 0 otherwise.
/// </summary>
/// <param name="tb">Block to measure.</param>
/// <param name="minTextDensity">Inclusive lower bound on the block's text density.</param>
/// <returns><c>tb.GetNumWords()</c> when the density threshold is met; otherwise 0.</returns>
protected internal static int GetNumFullTextWords(TextBlock tb, float minTextDensity)
{
    return tb.GetTextDensity() >= minTextDensity ? tb.GetNumWords() : 0;
}
/// <summary>
/// Tests whether <paramref name="tb"/> has a link density of exactly 0 and more
/// than 6 words.
/// </summary>
/// <param name="tb">Block to test.</param>
/// <returns><c>true</c> when both conditions hold; otherwise <c>false</c>.</returns>
public bool MeetsCondition(TextBlock tb)
{
    if (tb.GetLinkDensity() != 0)
    {
        return false;
    }
    return tb.GetNumWords() > 6;
}
/// <summary>
/// Convenience overload of <c>GetNumFullTextWords(TextBlock, float)</c> that uses
/// a minimum text density of 9.
/// </summary>
/// <param name="tb">Block to measure.</param>
/// <returns>The result of the two-argument overload with a threshold of 9.</returns>
protected internal static int GetNumFullTextWords(TextBlock tb)
{
    const float defaultMinTextDensity = 9;
    return GetNumFullTextWords(tb, defaultMinTextDensity);
}
/// <summary>
/// Labels <paramref name="tb"/> and appends it to <c>textBlocks</c>.
/// </summary>
/// <remarks>
/// The block first receives a <c>"font-" + value</c> label built from the first
/// entry enumerated from <c>fontSizeStack</c> (none if that collection is empty).
/// Then every <c>LabelAction</c> in every non-null list of <c>labelStacks</c> is
/// applied to it.
/// </remarks>
/// <param name="tb">Block to label and store.</param>
protected void AddTextBlock(TextBlock tb)
{
    // Only the first enumerated entry is used; the loop exits after one pass.
    foreach (int fontSize in fontSizeStack)
    {
        tb.AddLabels("font-" + fontSize);
        break;
    }

    foreach (List<LabelAction> actions in labelStacks)
    {
        if (actions == null)
        {
            continue;
        }
        foreach (LabelAction action in actions)
        {
            action.AddTo(tb);
        }
    }

    textBlocks.Add(tb);
}
/// <summary>
/// Converts the text buffered so far into a <c>TextBlock</c>, hands it to
/// <c>AddTextBlock</c>, and resets the buffers for the next block.
/// </summary>
/// <remarks>
/// While <c>inBody</c> is 0 no block is produced: the buffers are cleared, and if
/// the last start tag was TITLE (compared case-insensitively) the trimmed token
/// buffer is passed to <c>SetTitle</c> first. An empty token buffer, or a
/// one-character buffer while <c>sbLastWasWhitespace</c> is set, also produces no
/// block. Otherwise the tokens are counted (words, linked words, and lines
/// wrapped at 80 characters) and a block is created from those statistics.
/// </remarks>
public void FlushBlock()
{
    if (inBody == 0)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase("TITLE", lastStartTag))
        {
            SetTitle(tokenBuilder.ToString().Trim());
        }
        textBuilder.Length = 0;
        tokenBuilder.Length = 0;
        return;
    }

    int bufferedLength = tokenBuilder.Length;
    if (bufferedLength == 0)
    {
        // Nothing buffered; note that textBuilder is left untouched here.
        return;
    }
    if (bufferedLength == 1 && sbLastWasWhitespace)
    {
        textBuilder.Length = 0;
        tokenBuilder.Length = 0;
        return;
    }

    string[] tokens = UnicodeTokenizer.Tokenize(tokenBuilder);

    const int maxLineLength = 80;
    int wordCount = 0;
    int linkedWordCount = 0;
    int wrappedLineCount = 0;
    int lineLength = -1; // don't count the first space
    int tokenCount = 0;
    int wordsOnCurrentLine = 0;

    foreach (string token in tokens)
    {
        // Anchor markers only toggle the linked-text state; they are not counted.
        if (token == ANCHOR_TEXT_START)
        {
            inAnchorText = true;
            continue;
        }
        if (token == ANCHOR_TEXT_END)
        {
            inAnchorText = false;
            continue;
        }

        // Every remaining token counts as a token, whether or not it is a word.
        tokenCount++;
        if (!IsWord(token))
        {
            continue;
        }

        wordCount++;
        wordsOnCurrentLine++;
        if (inAnchorText)
        {
            linkedWordCount++;
        }

        int tokenLength = token.Length;
        lineLength += tokenLength + 1;
        if (lineLength > maxLineLength)
        {
            // The word overflows the line: start a new line containing only this word.
            wrappedLineCount++;
            lineLength = tokenLength;
            wordsOnCurrentLine = 1;
        }
    }

    if (tokenCount == 0)
    {
        return;
    }

    int wordsInWrappedLines;
    if (wrappedLineCount == 0)
    {
        // No wrap occurred: the whole block counts as a single line.
        wordsInWrappedLines = wordCount;
        wrappedLineCount = 1;
    }
    else
    {
        // Words on the last, unfinished line are excluded.
        wordsInWrappedLines = wordCount - wordsOnCurrentLine;
    }

    TextBlock block = new TextBlock(
        textBuilder.ToString().Trim(),
        currentContainedTextElements,
        wordCount,
        linkedWordCount,
        wordsInWrappedLines,
        wrappedLineCount,
        offsetBlocks);

    currentContainedTextElements = new BitSet();
    offsetBlocks++;
    textBuilder.Length = 0;
    tokenBuilder.Length = 0;

    block.SetTagLevel(blockTagLevel);
    AddTextBlock(block);
    blockTagLevel = -1;
}
/// <summary>
/// Adds this action's <c>labels</c> to <paramref name="tb"/> via
/// <c>TextBlock.AddLabels</c>.
/// </summary>
/// <param name="tb">Block that receives the labels.</param>
protected internal void AddLabelsTo(TextBlock tb)
{
    tb.AddLabels(this.labels);
}
/// <summary>
/// Applies this action to <paramref name="tb"/>. This base implementation
/// forwards straight to <c>AddLabelsTo</c>.
/// </summary>
/// <param name="tb">Block to label.</param>
public virtual void AddTo(TextBlock tb)
{
    this.AddLabelsTo(tb);
}
/// <summary>
/// Decides whether <paramref name="curr"/> is content from the word counts and
/// link densities of the block and its two neighbours, then stores the decision
/// on <paramref name="curr"/>.
/// </summary>
/// <param name="prev">Block preceding <paramref name="curr"/>.</param>
/// <param name="curr">Block being classified; its content flag is updated.</param>
/// <param name="next">Block following <paramref name="curr"/>.</param>
/// <returns>Whatever <c>curr.SetIsContent</c> returns for the computed flag.</returns>
protected internal bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    bool isContent;
    if (curr.GetLinkDensity() > 0 && next.GetNumWords() > 11)
    {
        // Block contains links and is followed by a longer block.
        isContent = true;
    }
    else if (curr.GetNumWords() > 19)
    {
        // Long blocks are content on their own.
        isContent = true;
    }
    else if (next.GetNumWords() > 6 && next.GetLinkDensity() == 0 && prev.GetLinkDensity() == 0)
    {
        // Link-free neighbours: one of the three blocks must be long enough.
        isContent = curr.GetNumWords() > 6 || prev.GetNumWords() > 7 || next.GetNumWords() > 19;
    }
    else
    {
        isContent = false;
    }
    return curr.SetIsContent(isContent);
}
/// <summary>
/// Creates a new <c>TextBlock</c> from this block's text, giving it its own copy
/// of the label set (when non-empty) and of the contained-text-element bit set
/// (when present).
/// </summary>
/// <returns>The newly created block, typed as <see cref="Object"/>.</returns>
public Object Clone()
{
    // NOTE(review): only text, labels and containedTextElements are carried over
    // here; any other state is whatever the TextBlock(string) constructor assigns.
    // Confirm that this is intended.
    TextBlock copy = new TextBlock(text.ToString());

    bool hasLabels = labels != null && !labels.IsEmpty();
    if (hasLabels)
    {
        copy.labels = new HashSet<string>(labels);
    }

    if (containedTextElements != null)
    {
        copy.containedTextElements = (BitSet)containedTextElements.Clone();
    }

    return copy;
}