/// <summary>
/// Switches a non-content block to content when the block before it and the
/// block after it are both content and the block satisfies <c>cond</c>.
/// </summary>
/// <remarks>
/// The scan advances two blocks per step: after each check the "after" block
/// becomes the new "before" block and the block following it becomes the next
/// candidate.
/// </remarks>
/// <param name="doc">The document whose text blocks are inspected and updated in place.</param>
/// <returns><c>true</c> if at least one block was switched to content; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    int count = blocks.Count;
    if (count < 3)
    {
        // A candidate needs a neighbour on both sides.
        return false;
    }

    bool promoted = false;
    TextBlock before = blocks[0];
    TextBlock middle = blocks[1];
    for (int i = 2; i < count; i += 2)
    {
        TextBlock after = blocks[i];
        if (!middle.IsContent() && before.IsContent() && after.IsContent() && cond.MeetsCondition(middle))
        {
            middle.SetIsContent(true);
            promoted = true;
        }

        before = after;
        if (i + 1 >= count)
        {
            break;
        }
        middle = blocks[i + 1];
    }
    return promoted;
}
/// <summary>
/// Splits every text block whose text spans several paragraphs (separated by
/// runs of CR/LF characters) into one block per paragraph.
/// </summary>
/// <remarks>
/// Each new block copies the content flag and the labels of the block it was
/// split from. Blocks with fewer than two paragraphs are kept unchanged. The
/// document's block list is only rewritten when at least one block was split.
/// </remarks>
/// <param name="doc">The document whose text blocks are split in place.</param>
/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool changes = false;
    IList<TextBlock> blocks = doc.GetTextBlocks();
    IList<TextBlock> blocksNew = new AList<TextBlock>();
    foreach (TextBlock tb in blocks)
    {
        string text = tb.GetText();

        // The separator is a regular expression. The built-in string.Split(string)
        // would treat it as a literal separator and never split anything, so the
        // regex engine is invoked explicitly.
        string[] paragraphs = System.Text.RegularExpressions.Regex.Split(text, "[\n\r]+");

        // Ignore trailing empty pieces (text ending in a line break), matching
        // the semantics of the Java String.split this code was ported from.
        int paragraphCount = paragraphs.Length;
        while (paragraphCount > 0 && paragraphs[paragraphCount - 1].Length == 0)
        {
            paragraphCount--;
        }

        if (paragraphCount < 2)
        {
            blocksNew.AddItem(tb);
            continue;
        }

        bool isContent = tb.IsContent();
        ICollection<string> labels = tb.GetLabels();
        for (int i = 0; i < paragraphCount; i++)
        {
            TextBlock tbP = new TextBlock(paragraphs[i]);
            tbP.SetIsContent(isContent);
            tbP.AddLabels(labels);
            blocksNew.AddItem(tbP);
            changes = true;
        }
    }

    if (changes)
    {
        blocks.Clear();
        Sharpen.Collections.AddAll(blocks, blocksNew);
    }
    return changes;
}
/// <summary>
/// Merges each text block into its predecessor when both report exactly the
/// same text density.
/// </summary>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns><c>true</c> if at least one block was merged and removed; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    if (blocks.Count < 2)
    {
        return false;
    }

    bool merged = false;
    TextBlock anchor = blocks[0];
    ListIterator<TextBlock> cursor = blocks.ListIterator(1);
    while (cursor.HasNext())
    {
        TextBlock candidate = cursor.Next();
        if (anchor.GetTextDensity() != candidate.GetTextDensity())
        {
            // Different density: the candidate starts a new run.
            anchor = candidate;
            continue;
        }

        anchor.MergeNext(candidate);
        cursor.Remove();
        merged = true;
    }
    return merged;
}
/// <summary>
/// Marks as non-content every block from the first end-of-text marker that is
/// reached after at least <c>minNumWords</c> full-text words of content.
/// </summary>
/// <remarks>
/// Words are accumulated from content blocks via <c>GetNumFullTextWords</c>.
/// The marker block itself is cleared too, and its own words are counted
/// before the threshold is tested.
/// </remarks>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if the end of text was found (so blocks were cleared); otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool changes = false;
    int wordsSeen = 0;
    bool pastEnd = false;

    foreach (TextBlock block in doc.GetTextBlocks())
    {
        bool marksEnd = block.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
        if (block.IsContent())
        {
            wordsSeen += GetNumFullTextWords(block);
        }

        if (marksEnd && wordsSeen >= minNumWords)
        {
            pastEnd = true;
        }

        if (pastEnd)
        {
            changes = true;
            block.SetIsContent(false);
        }
    }
    return changes;
}
/// <summary>
/// Marks as content the blocks labelled <c>MIGHT_BE_CONTENT</c> that lie
/// between the title block and the first content block.
/// </summary>
/// <remarks>
/// The title is the last block labelled <c>TITLE</c> seen before any content
/// block. Nothing happens when no such title exists or when the first content
/// block is not located after it.
/// </remarks>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if <c>SetIsContent(true)</c> reported a change for at least one block; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    int position = 0;
    int titleIndex = -1;
    int firstContentIndex = -1;

    foreach (TextBlock tb in doc.GetTextBlocks())
    {
        // Only look at blocks until the first content block has been located.
        if (firstContentIndex == -1)
        {
            if (tb.HasLabel(DefaultLabels.TITLE))
            {
                titleIndex = position;
            }
            if (tb.IsContent())
            {
                firstContentIndex = position;
            }
        }
        position++;
    }

    if (titleIndex == -1 || firstContentIndex <= titleIndex)
    {
        return false;
    }

    bool changes = false;
    foreach (TextBlock between in doc.GetTextBlocks().SubList(titleIndex, firstContentIndex))
    {
        if (!between.HasLabel(DefaultLabels.MIGHT_BE_CONTENT))
        {
            continue;
        }
        if (between.SetIsContent(true))
        {
            changes = true;
        }
    }
    return changes;
}
// public static long timeSpent = 0;
/// <summary>
/// Labels short blocks whose text matches one of a fixed set of phrases
/// (comment sections, rating prompts, copyright lines, ...) with
/// <c>INDICATES_END_OF_TEXT</c>.
/// </summary>
/// <remarks>
/// Only blocks with fewer than 15 words and at least 8 characters of trimmed
/// text are examined. Matching is case-insensitive and culture-independent:
/// the text is lower-cased with the invariant culture and prefixes are
/// compared ordinally, so the result does not depend on the current culture.
/// </remarks>
/// <param name="doc">The document whose text blocks are labelled in place.</param>
/// <returns><c>true</c> if at least one block was labelled; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    bool changes = false;
    // long t = System.currentTimeMillis();
    foreach (TextBlock tb in doc.GetTextBlocks())
    {
        if (tb.GetNumWords() >= 15)
        {
            continue;
        }

        string text = tb.GetText().Trim();
        int len = text.Length;
        if (len < 8)
        {
            continue;
        }

        // Invariant lower-casing: the phrases below are fixed literals, so a
        // culture-specific mapping (e.g. Turkish dotless i) must not apply.
        string textLC = text.ToLowerInvariant();
        bool terminating =
            textLC.StartsWith("comments", System.StringComparison.Ordinal)
            || StartsWithNumber(textLC, len, " comments", " users responded in")
            || textLC.StartsWith("© reuters", System.StringComparison.Ordinal)
            || textLC.StartsWith("please rate this", System.StringComparison.Ordinal)
            || textLC.StartsWith("post a comment", System.StringComparison.Ordinal)
            || textLC.Contains("what you think...")
            || textLC.Contains("add your comment")
            || textLC.Contains("add comment")
            || textLC.Contains("reader views")
            || textLC.Contains("have your say")
            || textLC.Contains("reader comments")
            || textLC.Contains("rätta artikeln")
            || textLC.Equals("thanks for your comments - this feedback is now closed");

        if (terminating)
        {
            tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
            changes = true;
        }
    }
    // timeSpent += System.currentTimeMillis() - t;
    return changes;
}
/// <summary>
/// Runs <c>Classify</c> over every text block together with its previous and
/// next neighbour.
/// </summary>
/// <remarks>
/// <c>TextBlock.EMPTY_START</c> stands in for the missing neighbour before
/// the first block and after the last block.
/// </remarks>
/// <param name="doc">The document whose text blocks are classified.</param>
/// <returns><c>true</c> if any <c>Classify</c> call returned <c>true</c>; <c>false</c> otherwise or when there are no blocks.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    ListIterator<TextBlock> cursor = textBlocks.ListIterator();
    if (!cursor.HasNext())
    {
        return false;
    }

    TextBlock previous = TextBlock.EMPTY_START;
    TextBlock current = cursor.Next();
    TextBlock following = cursor.HasNext() ? cursor.Next() : TextBlock.EMPTY_START;

    bool changed = this.Classify(previous, current, following);
    if (following == TextBlock.EMPTY_START)
    {
        // Single-block document: nothing left to slide over.
        return changed;
    }

    while (cursor.HasNext())
    {
        previous = current;
        current = following;
        following = cursor.Next();
        if (this.Classify(previous, current, following))
        {
            changed = true;
        }
    }

    // Last block: its next neighbour is the placeholder.
    if (this.Classify(current, following, TextBlock.EMPTY_START))
    {
        changed = true;
    }
    return changed;
}
/// <summary>
/// Repeatedly merges a block into the content block directly before it when
/// the block's link density is below 0.56 and it is not labelled
/// <c>STRICTLY_NOT_CONTENT</c>, until a full pass performs no merge.
/// </summary>
/// <remarks>
/// Every pass starts again from the first block of the list. Keeping the
/// previous-block reference from an earlier pass would pair the block at
/// index 1 with a stale, possibly non-adjacent block (or with itself).
/// </remarks>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns><c>true</c> if at least one block was merged; <c>false</c> otherwise or when there are fewer than two blocks.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    bool anyMerged = false;
    bool mergedThisPass;
    do
    {
        mergedThisPass = false;

        // Restart from the head of the list so that prevBlock is always the
        // block that immediately precedes the one being examined.
        TextBlock prevBlock = textBlocks[0];
        for (ListIterator<TextBlock> it = textBlocks.ListIterator(1); it.HasNext(); )
        {
            TextBlock block = it.Next();
            if (prevBlock.IsContent() && block.GetLinkDensity() < 0.56 && !block.HasLabel(DefaultLabels.STRICTLY_NOT_CONTENT))
            {
                prevBlock.MergeNext(block);
                it.Remove();
                mergedThisPass = true;
            }
            else
            {
                prevBlock = block;
            }
        }

        if (mergedThisPass)
        {
            anyMerged = true;
        }
    }
    while (mergedThisPass);

    return anyMerged;
}
/// <summary>
/// Walks the text blocks from the end of the document and demotes every
/// block labelled <c>INDICATES_END_OF_TEXT</c>, stopping once more than 200
/// words of content have been passed.
/// </summary>
/// <remarks>
/// A demoted block gets the <c>STRICTLY_NOT_CONTENT</c> label, loses the
/// <c>MIGHT_BE_CONTENT</c> label and is marked as non-content. Words of
/// demoted blocks do not count towards the 200-word limit.
/// </remarks>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if at least one block was demoted; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    bool changes = false;
    int contentWords = 0;

    for (int i = blocks.Count - 1; i >= 0; i--)
    {
        TextBlock tb = blocks[i];
        if (tb.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT))
        {
            tb.AddLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
            tb.RemoveLabel(DefaultLabels.MIGHT_BE_CONTENT);
            tb.SetIsContent(false);
            changes = true;
            continue;
        }

        if (!tb.IsContent())
        {
            continue;
        }

        contentWords += tb.GetNumWords();
        if (contentWords > 200)
        {
            break;
        }
    }
    return changes;
}
/// <summary>
/// Merges each text block into its predecessor when <c>EqualLabels</c>
/// reports that both carry the same labels.
/// </summary>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns><c>true</c> if at least one block was merged and removed; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    if (blocks.Count < 2)
    {
        return false;
    }

    bool merged = false;
    TextBlock anchor = blocks[0];
    ListIterator<TextBlock> cursor = blocks.ListIterator(1);
    while (cursor.HasNext())
    {
        TextBlock candidate = cursor.Next();
        if (!EqualLabels(anchor.GetLabels(), candidate.GetLabels()))
        {
            // Label sets differ: the candidate starts a new run.
            anchor = candidate;
            continue;
        }

        anchor.MergeNext(candidate);
        cursor.Remove();
        merged = true;
    }
    return merged;
}
/// <summary>
/// Computes statistics on a given <see cref="TextDocument">TextDocument</see>:
/// the number of counted blocks and the sum of their word counts.
/// </summary>
/// <param name="doc">
/// The <see cref="TextDocument">TextDocument</see> whose text blocks are counted.
/// </param>
/// <param name="contentOnly">
/// If <c>true</c>, only blocks marked as content are counted; if <c>false</c>,
/// every block is counted.
/// </param>
public TextDocumentStatistics(TextDocument doc, bool contentOnly)
{
    foreach (TextBlock block in doc.GetTextBlocks())
    {
        bool counted = !contentOnly || block.IsContent();
        if (counted)
        {
            numWords += block.GetNumWords();
            numBlocks++;
        }
    }
}
/// <summary>
/// Inverts the content flag of every text block: content becomes
/// non-content and vice versa.
/// </summary>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if the document has at least one text block; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    foreach (TextBlock block in blocks)
    {
        bool wasContent = block.IsContent();
        block.SetIsContent(!wasContent);
    }

    // With no blocks there was nothing to flip.
    return blocks.Count > 0;
}
/// <summary>
/// Marks every text block of the document as content.
/// </summary>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if at least one block was not content before the call; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool changes = false;
    foreach (TextBlock block in doc.GetTextBlocks())
    {
        if (block.IsContent())
        {
            continue;
        }
        block.SetIsContent(true);
        changes = true;
    }
    return changes;
}
/// <summary>
/// Runs a fixed sequence of nine processing stages over the document.
/// </summary>
/// <remarks>
/// Every stage always runs, in the order listed, regardless of what earlier
/// stages returned. The title classifier is created from the document's title
/// after the first stage has run.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = TerminatingBlocksFinder.INSTANCE.Process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.GetTitle()).Process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.Process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY.Process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE.Process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Removes every text block that is not marked as content from the document.
/// </summary>
/// <param name="doc">The document whose block list is pruned in place.</param>
/// <returns><c>true</c> if at least one block was removed; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    bool removedAny = false;

    Iterator<TextBlock> cursor = textBlocks.Iterator();
    while (cursor.HasNext())
    {
        TextBlock block = cursor.Next();
        if (block.IsContent())
        {
            continue;
        }
        cursor.Remove();
        removedAny = true;
    }
    return removedAny;
}
/// <summary>
/// Marks content blocks with fewer than <c>minWords</c> words as non-content.
/// </summary>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if at least one block was demoted; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool changes = false;
    foreach (TextBlock block in doc.GetTextBlocks())
    {
        bool tooShort = block.IsContent() && block.GetNumWords() < minWords;
        if (tooShort)
        {
            block.SetIsContent(false);
            changes = true;
        }
    }
    return changes;
}
/// <summary>
/// Marks a content block as non-content when it carries any of the labels in
/// <c>labels</c>.
/// </summary>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if at least one block was demoted; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool changes = false;
    foreach (TextBlock block in doc.GetTextBlocks())
    {
        if (!block.IsContent())
        {
            continue;
        }

        foreach (string label in labels)
        {
            if (!block.HasLabel(label))
            {
                continue;
            }

            block.SetIsContent(false);
            changes = true;
            // One matching label is enough; move on to the next block.
            break;
        }
    }
    return changes;
}
/// <summary>
/// Marks a content block as non-content when none of its delimiter-terminated
/// segments is accepted by <c>IsClause</c>.
/// </summary>
/// <remarks>
/// Segments are cut at matches of <c>PAT_CLAUSE_DELIMITER</c>; each candidate
/// runs from the end of the previous match through the first character of the
/// current match. The text after the last examined delimiter is only tested
/// when <c>acceptClausesWithoutDelimiter</c> is set.
/// </remarks>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if at least one block was demoted; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool changes = false;
    foreach (TextBlock tb in doc.GetTextBlocks())
    {
        if (!tb.IsContent())
        {
            continue;
        }

        string text = tb.GetText();
        Matcher delimiters = PAT_CLAUSE_DELIMITER.Matcher(text);
        int clauseStart = 0;
        bool hasClause = false;

        // Stop scanning as soon as one segment qualifies.
        while (!hasClause && delimiters.Find())
        {
            int clauseEnd = delimiters.Start() + 1;
            hasClause = IsClause(text.SubSequence(clauseStart, clauseEnd));
            clauseStart = delimiters.End();
        }

        // Clauses should always end with a delimiter, so the remaining text
        // is normally not considered.
        if (acceptClausesWithoutDelimiter)
        {
            hasClause |= IsClause(text.SubSequence(clauseStart, text.Length));
        }

        if (!hasClause)
        {
            tb.SetIsContent(false);
            changes = true;
        }
    }
    // System.err.println("IS NOT CONTENT: " + text);
    return changes;
}
/// <summary>
/// Keeps only the content block with the highest <c>GetNumFullTextWords</c>
/// value as content; every other block becomes non-content and is labelled
/// <c>MIGHT_BE_CONTENT</c>.
/// </summary>
/// <remarks>
/// On a tie the earliest block wins, because a later block must be strictly
/// larger to replace it.
/// </remarks>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns>
/// <c>false</c> if there are fewer than two blocks or no content block;
/// otherwise <c>true</c>.
/// </returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    int bestWords = -1;
    TextBlock winner = null;
    foreach (TextBlock candidate in textBlocks)
    {
        if (candidate.IsContent())
        {
            int words = GetNumFullTextWords(candidate);
            if (words > bestWords)
            {
                winner = candidate;
                bestWords = words;
            }
        }
    }

    if (winner == null)
    {
        return false;
    }

    foreach (TextBlock block in textBlocks)
    {
        bool keep = block == winner;
        block.SetIsContent(keep);
        if (!keep)
        {
            block.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
        }
    }
    return true;
}
/// <summary>
/// Marks blocks of at most 10 words whose text matches one of
/// <c>PATTERNS_SHORT</c> as content and labels them <c>ARTICLE_METADATA</c>.
/// </summary>
/// <remarks>
/// Every pattern is tried for each short block, even after an earlier
/// pattern has already matched.
/// </remarks>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>true</c> if at least one pattern matched a block; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    bool changed = false;
    foreach (TextBlock block in doc.GetTextBlocks())
    {
        if (block.GetNumWords() <= 10)
        {
            string text = block.GetText();
            foreach (Sharpen.Pattern pattern in PATTERNS_SHORT)
            {
                if (!pattern.Matcher(text).Find())
                {
                    continue;
                }
                changed = true;
                block.SetIsContent(true);
                block.AddLabel(DefaultLabels.ARTICLE_METADATA);
            }
        }
    }
    return changed;
}
/// <summary>
/// Copies the labels of a block, each prefixed with <c>labelPrefix</c>, onto
/// the block directly below it, working from the end of the document
/// upwards.
/// </summary>
/// <remarks>
/// Because the walk goes bottom-up, a block's labels are read before that
/// block receives prefixed labels from the block above it.
/// </remarks>
/// <param name="doc">The document whose text blocks are labelled in place.</param>
/// <returns><c>true</c> if at least one examined block had labels to copy; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    bool changes = false;

    // The walk stops at index 1, so the labels of the first block are never
    // copied downwards. NOTE(review): confirm whether index 0 was meant to be
    // included.
    for (int i = textBlocks.Count - 2; i >= 1; i--)
    {
        TextBlock source = textBlocks[i];
        TextBlock below = textBlocks[i + 1];

        ICollection<string> labels = source.GetLabels();
        if (labels == null || labels.Count == 0)
        {
            continue;
        }

        foreach (string label in labels)
        {
            below.AddLabel(labelPrefix + label);
        }
        changes = true;
    }
    return changes;
}
/// <summary>
/// Runs three processing stages over the document: the word-count rules
/// classifier, block proximity fusion, and the keep-largest-block filter.
/// </summary>
/// <remarks>
/// Every stage always runs, in that order, regardless of what earlier stages
/// returned.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = NumWordsRulesClassifier.INSTANCE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Labels as <c>TITLE</c> every text block whose trimmed text equals one of
/// the entries in <c>potentialTitles</c>.
/// </summary>
/// <param name="doc">The document whose text blocks are labelled in place.</param>
/// <returns>
/// <c>true</c> if at least one block was labelled; <c>false</c> otherwise or
/// when <c>potentialTitles</c> is <c>null</c>.
/// </returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    if (potentialTitles == null)
    {
        return false;
    }

    bool labelled = false;
    foreach (TextBlock block in doc.GetTextBlocks())
    {
        string trimmed = block.GetText().Trim();
        foreach (string candidate in potentialTitles)
        {
            if (!candidate.Equals(trimmed))
            {
                continue;
            }
            block.AddLabel(DefaultLabels.TITLE);
            labelled = true;
        }
    }
    return labelled;
}
/// <summary>
/// Keeps only the content block with the most words as content and labels
/// every other block <c>MIGHT_BE_CONTENT</c>; optionally re-marks neighbours
/// on the same tag level as content.
/// </summary>
/// <remarks>
/// When <c>expandToSameLevelText</c> is set and a largest block exists, the
/// list is walked outwards from that block in both directions. Each walk
/// stops at the first block whose tag level is lower than the largest
/// block's; blocks with an equal tag level are marked as content, deeper
/// ones are skipped.
/// </remarks>
/// <param name="doc">The document whose text blocks are updated in place.</param>
/// <returns><c>false</c> if there are fewer than two blocks; otherwise <c>true</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    int total = blocks.Count;
    if (total < 2)
    {
        return false;
    }

    TextBlock winner = null;
    int winnerWords = -1;
    int winnerIndex = -1;
    int winnerLevel = -1;
    for (int i = 0; i < total; i++)
    {
        TextBlock candidate = blocks[i];
        if (!candidate.IsContent())
        {
            continue;
        }

        int words = candidate.GetNumWords();
        if (words <= winnerWords)
        {
            continue;
        }

        winner = candidate;
        winnerWords = words;
        winnerIndex = i;
        if (expandToSameLevelText)
        {
            winnerLevel = candidate.GetTagLevel();
        }
    }

    foreach (TextBlock block in blocks)
    {
        if (block == winner)
        {
            block.SetIsContent(true);
        }
        else
        {
            block.SetIsContent(false);
            block.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
        }
    }

    if (!expandToSameLevelText || winnerIndex == -1)
    {
        return true;
    }

    // Upwards, starting with the block just before the largest one.
    for (int i = winnerIndex - 1; i >= 0; i--)
    {
        TextBlock above = blocks[i];
        int tagLevel = above.GetTagLevel();
        if (tagLevel < winnerLevel)
        {
            break;
        }
        if (tagLevel == winnerLevel)
        {
            above.SetIsContent(true);
        }
    }

    // Downwards, starting at the largest block itself.
    for (int i = winnerIndex; i < total; i++)
    {
        TextBlock below = blocks[i];
        int tagLevel = below.GetTagLevel();
        if (tagLevel < winnerLevel)
        {
            break;
        }
        if (tagLevel == winnerLevel)
        {
            below.SetIsContent(true);
        }
    }
    return true;
}
/// <summary>
/// Processes the document with <c>MarkEverythingContentFilter.INSTANCE</c>.
/// </summary>
/// <param name="doc">The document to process in place.</param>
/// <returns>The value returned by the filter.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = MarkEverythingContentFilter.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Runs the article extractor, then the paragraph-splitting filter, then the
/// minimum-clause-words filter over the document.
/// </summary>
/// <remarks>
/// Every stage always runs, in that order, regardless of what earlier stages
/// returned.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = ArticleExtractor.INSTANCE.Process(doc);
    changed |= SplitParagraphBlocksFilter.INSTANCE.Process(doc);
    changed |= MinClauseWordsFilter.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Runs simple block fusion, then block proximity fusion, then the density
/// rules classifier over the document.
/// </summary>
/// <remarks>
/// Every stage always runs, in that order, regardless of what earlier stages
/// returned.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = SimpleBlockFusionProcessor.INSTANCE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
    changed |= DensityRulesClassifier.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Merges a content block into the preceding block when the gap between
/// them, measured in block offsets, is at most <c>maxBlocksDistance</c>.
/// </summary>
/// <remarks>
/// With <c>contentOnly</c> set, the scan starts after the first content block
/// and both blocks of a pair must be content. With <c>sameTagLevelOnly</c>
/// set, both blocks must also have the same tag level. A non-content block is
/// never merged; it simply becomes the new preceding block.
/// </remarks>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns>
/// <c>true</c> if at least one block was merged and removed; <c>false</c>
/// otherwise, when there are fewer than two blocks, or when
/// <c>contentOnly</c> is set and no content block exists.
/// </returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    TextBlock anchor;
    int startIndex;
    if (!contentOnly)
    {
        anchor = textBlocks[0];
        startIndex = 1;
    }
    else
    {
        // Begin right after the first content block.
        anchor = null;
        startIndex = 0;
        foreach (TextBlock tb in textBlocks)
        {
            startIndex++;
            if (tb.IsContent())
            {
                anchor = tb;
                break;
            }
        }
        if (anchor == null)
        {
            return false;
        }
    }

    bool changes = false;
    ListIterator<TextBlock> cursor = textBlocks.ListIterator<TextBlock>(startIndex);
    while (cursor.HasNext())
    {
        TextBlock block = cursor.Next();
        if (!block.IsContent())
        {
            anchor = block;
            continue;
        }

        int gap = block.GetOffsetBlocksStart() - anchor.GetOffsetBlocksEnd() - 1;
        bool mergeable = gap <= maxBlocksDistance;
        if (mergeable && contentOnly)
        {
            mergeable = anchor.IsContent() && block.IsContent();
        }
        if (mergeable && sameTagLevelOnly)
        {
            mergeable = anchor.GetTagLevel() == block.GetTagLevel();
        }

        if (mergeable)
        {
            anchor.MergeNext(block);
            cursor.Remove();
            changes = true;
        }
        else
        {
            anchor = block;
        }
    }
    return changes;
}
/// <summary>
/// Processes the document with <c>NumWordsRulesClassifier.INSTANCE</c>.
/// </summary>
/// <param name="doc">The document to process in place.</param>
/// <returns>The value returned by the classifier.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = NumWordsRulesClassifier.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Processes the document with the <c>CLASSIFIER</c> member.
/// </summary>
/// <param name="doc">The document to process in place.</param>
/// <returns>The value returned by <c>CLASSIFIER.Process</c>.</returns>
/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = CLASSIFIER.Process(doc);
    return changed;
}