/// <summary>
/// Splits every text block whose text contains line breaks into one block per line.
/// </summary>
/// <remarks>
/// Blocks that yield fewer than two non-empty lines are kept as they are. Each block created
/// by a split is built from the line text alone and receives the content flag and the labels
/// of the block it came from. When at least one block was split, the list returned by
/// <c>doc.GetTextBlocks()</c> is cleared and refilled in place with the new sequence.
/// </remarks>
/// <param name="doc">The document whose text blocks are examined.</param>
/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    // Only line-break characters separate paragraphs. Splitting on '[', ']' or '+' as well
    // would cut ordinary text such as "a+b" or "[1]" into separate blocks.
    char[] lineBreaks = { '\n', '\r' };

    bool changes = false;
    IList<TextBlock> blocks = doc.GetTextBlocks();
    IList<TextBlock> blocksNew = new List<TextBlock>();

    foreach (TextBlock tb in blocks)
    {
        // RemoveEmptyEntries makes a run of line breaks ("\r\n", blank lines) act as a
        // single separator instead of producing empty paragraphs.
        string[] paragraphs = tb.GetText().Split(lineBreaks, System.StringSplitOptions.RemoveEmptyEntries);
        if (paragraphs.Length < 2)
        {
            blocksNew.Add(tb);
            continue;
        }

        bool isContent = tb.IsContent();
        ICollection<string> labels = tb.GetLabels();
        foreach (string p in paragraphs)
        {
            TextBlock tbP = new TextBlock(p);
            tbP.SetIsContent(isContent);
            tbP.AddLabels(labels);
            blocksNew.Add(tbP);
        }
        changes = true;
    }

    if (changes)
    {
        // Replace the contents in place so the list instance handed out by the document stays the same.
        blocks.Clear();
        foreach (TextBlock block in blocksNew)
        {
            blocks.Add(block);
        }
    }

    return changes;
}
/// <summary>
/// Decides whether <paramref name="curr"/> is content from its own link density and word
/// count together with those of its neighbours, and stores the decision on the block.
/// </summary>
/// <param name="prev">The block preceding <paramref name="curr"/>.</param>
/// <param name="curr">The block being classified.</param>
/// <param name="next">The block following <paramref name="curr"/>.</param>
/// <returns>The value returned by <c>curr.SetIsContent</c> for the computed decision.</returns>
protected internal virtual bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    // A block whose link density exceeds the first threshold is never content.
    bool verdict = false;

    if (curr.GetLinkDensity() <= 0.333333)
    {
        if (prev.GetLinkDensity() <= 0.555556)
        {
            // Rejected only when current, next and previous blocks are all short.
            verdict = curr.GetNumWords() > 16
                || next.GetNumWords() > 15
                || prev.GetNumWords() > 4;
        }
        else
        {
            // Previous block is link-heavy: require a longer current or next block.
            verdict = curr.GetNumWords() > 40
                || next.GetNumWords() > 17;
        }
    }

    return curr.SetIsContent(verdict);
}
/// <summary>
/// Returns the word count of <paramref name="tb"/> when its text density reaches
/// <paramref name="minTextDensity"/>, and 0 otherwise.
/// </summary>
/// <param name="tb">The block to inspect.</param>
/// <param name="minTextDensity">Inclusive lower bound on the block's text density.</param>
/// <returns><c>tb.GetNumWords()</c> if the density threshold is met; otherwise 0.</returns>
protected internal static int GetNumFullTextWords(TextBlock tb, float minTextDensity)
{
    return tb.GetTextDensity() >= minTextDensity ? tb.GetNumWords() : 0;
}
/// <summary>
/// Convenience overload that calls the two-argument <c>GetNumFullTextWords</c> with a
/// minimum text density of 9.
/// </summary>
/// <param name="tb">The block to inspect.</param>
/// <returns>The result of the two-argument overload for a threshold of 9.</returns>
protected internal static int GetNumFullTextWords(TextBlock tb)
{
    const int defaultMinTextDensity = 9;
    return GetNumFullTextWords(tb, defaultMinTextDensity);
}
/// <summary>
/// Labels <paramref name="tb"/> and appends it to <c>textBlocks</c>.
/// </summary>
/// <remarks>
/// The first entry enumerated from <c>fontSizeStack</c>, if there is one, is added as a
/// "font-N" label. Every <c>LabelAction</c> found in the non-null lists of
/// <c>labelStacks</c> is then applied to the block.
/// </remarks>
/// <param name="tb">The block to label and store.</param>
protected void AddTextBlock(TextBlock tb)
{
    // Only the first enumerated font size is used; the loop exits right after it.
    foreach (int fontSize in fontSizeStack)
    {
        tb.AddLabels("font-" + fontSize);
        break;
    }

    foreach (List<LabelAction> actions in labelStacks)
    {
        if (actions == null)
        {
            continue;
        }

        foreach (LabelAction action in actions)
        {
            action.AddTo(tb);
        }
    }

    textBlocks.Add(tb);
}
/// <summary>
/// Converts the text buffered in <c>textBuilder</c> and <c>tokenBuilder</c> into a
/// <c>TextBlock</c>, hands it to <c>AddTextBlock</c> and resets both buffers.
/// </summary>
/// <remarks>
/// While <c>inBody</c> is 0 no block is produced. If the last start tag was TITLE (compared
/// case-insensitively), the trimmed token buffer is passed to <c>SetTitle</c>, and the
/// buffers are cleared either way. Inside the body, an empty buffer or a buffer holding a
/// single whitespace character produces no block. Otherwise the tokens are counted (words,
/// words inside anchor text, and lines wrapped at 80 characters) and a block carrying those
/// statistics is created.
/// </remarks>
public void FlushBlock()
{
    if (inBody == 0)
    {
        // Tag names are identifiers, not linguistic text: compare ordinally so the match does
        // not depend on the current culture (e.g. Turkish casing rules for 'i' / 'I').
        if (string.Equals("TITLE", lastStartTag, StringComparison.OrdinalIgnoreCase))
        {
            SetTitle(tokenBuilder.ToString().Trim());
        }

        textBuilder.Length = 0;
        tokenBuilder.Length = 0;
        return;
    }

    int length = tokenBuilder.Length;
    if (length == 0)
    {
        return;
    }

    if (length == 1 && sbLastWasWhitespace)
    {
        // Nothing but a single whitespace character was buffered: discard it.
        textBuilder.Length = 0;
        tokenBuilder.Length = 0;
        return;
    }

    string[] tokens = UnicodeTokenizer.Tokenize(tokenBuilder);

    const int maxLineLength = 80;
    int numWords = 0;
    int numLinkedWords = 0;
    int numWrappedLines = 0;
    int currentLineLength = -1; // don't count the first space
    int numTokens = 0;
    int numWordsCurrentLine = 0;

    foreach (string token in tokens)
    {
        if (token == ANCHOR_TEXT_START)
        {
            // Anchor markers only toggle the link state; they are not counted as tokens.
            inAnchorText = true;
        }
        else if (token == ANCHOR_TEXT_END)
        {
            inAnchorText = false;
        }
        else if (IsWord(token))
        {
            numTokens++;
            numWords++;
            numWordsCurrentLine++;
            if (inAnchorText)
            {
                numLinkedWords++;
            }

            int tokenLength = token.Length;
            currentLineLength += tokenLength + 1; // +1 for the separating space
            if (currentLineLength > maxLineLength)
            {
                // The word does not fit: it becomes the first word of a new wrapped line.
                numWrappedLines++;
                currentLineLength = tokenLength;
                numWordsCurrentLine = 1;
            }
        }
        else
        {
            numTokens++;
        }
    }

    if (numTokens == 0)
    {
        // NOTE(review): the buffers are left untouched on this path — confirm this is intended.
        return;
    }

    int numWordsInWrappedLines;
    if (numWrappedLines == 0)
    {
        // Everything fit on one line: treat it as a single wrapped line holding all words.
        numWordsInWrappedLines = numWords;
        numWrappedLines = 1;
    }
    else
    {
        // Words on the last, unfinished line are excluded.
        numWordsInWrappedLines = numWords - numWordsCurrentLine;
    }

    TextBlock tb = new TextBlock(
        textBuilder.ToString().Trim(),
        currentContainedTextElements,
        numWords,
        numLinkedWords,
        numWordsInWrappedLines,
        numWrappedLines,
        offsetBlocks);

    // The new block owns the current element set; start a fresh one for the next block.
    currentContainedTextElements = new BitSet();
    offsetBlocks++;
    textBuilder.Length = 0;
    tokenBuilder.Length = 0;

    tb.SetTagLevel(blockTagLevel);
    AddTextBlock(tb);
    blockTagLevel = -1;
}
/// <summary>
/// Decides whether <paramref name="curr"/> is content from the link densities and word
/// counts of the block and its neighbours, and stores the decision on the block.
/// </summary>
/// <param name="prev">The block preceding <paramref name="curr"/>.</param>
/// <param name="curr">The block being classified.</param>
/// <param name="next">The block following <paramref name="curr"/>.</param>
/// <returns>The value returned by <c>curr.SetIsContent</c> for the computed decision.</returns>
protected internal bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    bool verdict;

    if (curr.GetLinkDensity() > 0 && next.GetNumWords() > 11)
    {
        // Linked block followed by a reasonably long block.
        verdict = true;
    }
    else if (curr.GetNumWords() > 19)
    {
        // Long blocks are accepted on their own.
        verdict = true;
    }
    else if (next.GetNumWords() > 6
        && next.GetLinkDensity() == 0
        && prev.GetLinkDensity() == 0)
    {
        // Link-free neighbourhood: one of the three blocks must be long enough.
        verdict = curr.GetNumWords() > 6
            || prev.GetNumWords() > 7
            || next.GetNumWords() > 19;
    }
    else
    {
        verdict = false;
    }

    return curr.SetIsContent(verdict);
}
/// <summary>
/// Tests whether <paramref name="tb"/> has a link density of exactly 0 and more than six words.
/// </summary>
/// <param name="tb">The block to test.</param>
/// <returns><c>true</c> when both conditions hold; otherwise <c>false</c>.</returns>
public bool MeetsCondition(TextBlock tb)
{
    if (tb.GetLinkDensity() != 0)
    {
        return false;
    }

    return tb.GetNumWords() > 6;
}
/// <summary>
/// Creates a new <c>TextBlock</c> from this block's text and gives it its own copies of the
/// label set and the contained-text-element bit set.
/// </summary>
/// <remarks>
/// NOTE(review): only the text, the labels and <c>containedTextElements</c> are carried over;
/// any other state is whatever the <c>TextBlock(string)</c> constructor sets up — confirm
/// this is intended.
/// </remarks>
/// <returns>The newly created block.</returns>
public Object Clone()
{
    TextBlock copy = new TextBlock(text.ToString());

    // An absent or empty label set is not copied.
    bool hasLabels = labels != null && !labels.IsEmpty();
    if (hasLabels)
    {
        copy.labels = new HashSet<string>(labels);
    }

    if (containedTextElements != null)
    {
        copy.containedTextElements = (BitSet)containedTextElements.Clone();
    }

    return copy;
}