/// <summary>
/// Visits every text block of the document once and hands it to <c>Classify</c>
/// together with the block before it and the block after it.
/// </summary>
/// <remarks>
/// <c>TextBlock.EMPTY_START</c> is passed in place of the missing neighbour for the
/// first block (no predecessor) and for the last block (no successor).
/// </remarks>
/// <param name="doc">The document whose text blocks are classified.</param>
/// <returns>
/// <c>true</c> if at least one <c>Classify</c> call returned <c>true</c>;
/// <c>false</c> otherwise, including when the document has no text blocks.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    var cursor = blocks.GetEnumerator();
    if (!cursor.MoveNext())
    {
        return false;
    }

    // First block: there is no predecessor, and possibly no successor either.
    TextBlock current = cursor.Current;
    TextBlock upcoming = cursor.MoveNext() ? cursor.Current : TextBlock.EMPTY_START;
    bool changed = Classify(TextBlock.EMPTY_START, current, upcoming);

    if (upcoming != TextBlock.EMPTY_START)
    {
        // Slide the three-block window over the remaining blocks.
        while (cursor.MoveNext())
        {
            TextBlock previous = current;
            current = upcoming;
            upcoming = cursor.Current;
            changed |= Classify(previous, current, upcoming);
        }

        // Last block: it has a predecessor but no successor.
        changed |= Classify(current, upcoming, TextBlock.EMPTY_START);
    }

    return changed;
}
/// <summary>
/// Replaces every text block whose text contains line breaks with one new block per
/// non-empty line.
/// </summary>
/// <remarks>
/// The text is split on runs of <c>\n</c> / <c>\r</c>. Blocks that yield fewer than
/// two paragraphs are kept unchanged. Each new block receives the content flag and
/// the labels of the block it was split from.
/// </remarks>
/// <param name="doc">The document whose block list is rewritten in place.</param>
/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> blocks = doc.GetTextBlocks();
    IList<TextBlock> blocksNew = new List<TextBlock>(blocks.Count);
    bool changes = false;

    // Only line-break characters separate paragraphs. The previous separator list
    // ('[', '\n', '\r', ']', '+') was the regex "[\n\r]+" taken apart character by
    // character, so it also split on brackets and plus signs.
    char[] lineBreaks = new[] { '\n', '\r' };

    foreach (TextBlock tb in blocks)
    {
        // RemoveEmptyEntries collapses runs of line breaks (e.g. "\r\n") so that no
        // empty blocks are produced.
        string[] paragraphs = tb.GetText().Split(
            lineBreaks, System.StringSplitOptions.RemoveEmptyEntries);

        if (paragraphs.Length < 2)
        {
            blocksNew.Add(tb);
            continue;
        }

        bool isContent = tb.IsContent();
        ICollection<string> labels = tb.GetLabels();
        foreach (string p in paragraphs)
        {
            TextBlock tbP = new TextBlock(p);
            tbP.SetIsContent(isContent);
            tbP.AddLabels(labels);
            blocksNew.Add(tbP);
        }
        changes = true;
    }

    if (changes)
    {
        // Swap the contents of the document's own list rather than replacing the list.
        blocks.Clear();
        foreach (TextBlock block in blocksNew)
        {
            blocks.Add(block);
        }
    }

    return changes;
}
/// <summary>
/// Repeatedly merges a block into the block before it until a full pass over the
/// document produces no further merge.
/// </summary>
/// <remarks>
/// A block is merged into its predecessor when the predecessor is content, the
/// block's link density is below 0.56 and the block is not labelled
/// <c>DefaultLabels.STRICTLY_NOT_CONTENT</c>. Merged blocks are removed from the
/// document's block list.
/// </remarks>
/// <param name="doc">The document whose block list is fused in place.</param>
/// <returns>
/// <c>false</c> if the document has fewer than two blocks; otherwise always
/// <c>true</c>, whether or not any merge happened.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    bool changes;
    do
    {
        // Must be reset on every pass; otherwise the first merge would keep the
        // outer loop running forever.
        changes = false;

        // Every pass starts again from the first block so that only neighbouring
        // blocks are ever merged.
        TextBlock b1 = textBlocks[0];
        int index = 1;
        while (index < textBlocks.Count)
        {
            TextBlock b2 = textBlocks[index];
            if (b1.IsContent()
                && b2.GetLinkDensity() < 0.56
                && !b2.HasLabel(DefaultLabels.STRICTLY_NOT_CONTENT))
            {
                b1.MergeNext(b2);
                // The following block slides into this index, so it is not advanced.
                textBlocks.RemoveAt(index);
                changes = true;
            }
            else
            {
                b1 = b2;
                index++;
            }
        }
    }
    while (changes);

    return true;
}
/// <summary>
/// Marks a non-content block as content when the blocks on both sides of it are
/// content and the block satisfies <c>cond</c>.
/// </summary>
/// <remarks>
/// Blocks are examined as triples (0,1,2), (2,3,4), (4,5,6), ... so the last block
/// of one triple is the first block of the next.
/// NOTE(review): this stepping is believed to match the Java boilerpipe
/// implementation this class was ported from - confirm against upstream.
/// </remarks>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns>
/// <c>true</c> if at least one block was switched to content; <c>false</c>
/// otherwise, including when the document has fewer than three blocks.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    IList<TextBlock> tbs = doc.GetTextBlocks();
    if (tbs.Count < 3)
    {
        return false;
    }

    bool hasChanges = false;

    // Indexing keeps the three roles distinct. The earlier enumerator-based loop read
    // the same element for both the middle and the right-hand block after its first
    // iteration, so the condition below could never be true again.
    for (int right = 2; right < tbs.Count; right += 2)
    {
        TextBlock a = tbs[right - 2];
        TextBlock b = tbs[right - 1];
        TextBlock c = tbs[right];

        if (!b.IsContent() && a.IsContent() && c.IsContent() && cond.MeetsCondition(b))
        {
            b.SetIsContent(true);
            hasChanges = true;
        }
    }

    return hasChanges;
}
/// <summary>
/// Marks list items that follow a very likely content block as content.
/// </summary>
/// <remarks>
/// A block that is content and labelled <c>DefaultLabels.VERY_LIKELY_CONTENT</c> sets
/// the reference tag level. Any other block is promoted to content when its tag level
/// is greater than that reference, it carries both <c>MIGHT_BE_CONTENT</c> and
/// <c>LI</c>, and its link density is exactly 0; a block that fails this test clears
/// the reference again.
/// </remarks>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns><c>true</c> if at least one block was switched to content.</returns>
public bool Process(TextDocument doc)
{
    bool changes = false;
    int referenceLevel = Int32.MaxValue;

    foreach (TextBlock block in doc.GetTextBlocks())
    {
        if (block.IsContent() && block.HasLabel(DefaultLabels.VERY_LIKELY_CONTENT))
        {
            referenceLevel = block.GetTagLevel();
            continue;
        }

        bool qualifies = block.GetTagLevel() > referenceLevel
            && block.HasLabel(DefaultLabels.MIGHT_BE_CONTENT)
            && block.HasLabel(DefaultLabels.LI)
            && block.GetLinkDensity() == 0;

        if (!qualifies)
        {
            // MaxValue makes the "greater than" test fail until a new reference is seen.
            referenceLevel = Int32.MaxValue;
            continue;
        }

        block.SetIsContent(true);
        changes = true;
    }

    return changes;
}
/// <summary>
/// Marks large non-content blocks as content when they sit on the same tag level as
/// the first very likely content block.
/// </summary>
/// <remarks>
/// The reference tag level is taken from the first block that is content and labelled
/// <c>DefaultLabels.VERY_LIKELY_CONTENT</c>. Every non-content block with at least
/// 100 words on that tag level is then switched to content.
/// </remarks>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns>
/// <c>true</c> if at least one block was switched to content; <c>false</c>
/// otherwise, including when no reference block exists.
/// </returns>
public bool Process(TextDocument doc)
{
    int tagLevel = -1;
    foreach (var tb in doc.GetTextBlocks())
    {
        if (tb.IsContent() && tb.HasLabel(DefaultLabels.VERY_LIKELY_CONTENT))
        {
            tagLevel = tb.GetTagLevel();
            break;
        }
    }

    // This check has to run after the whole search. Inside the loop it aborted the
    // method as soon as the first block was not a reference block.
    if (tagLevel == -1)
    {
        return false;
    }

    bool changes = false;
    foreach (var tb in doc.GetTextBlocks())
    {
        if (!tb.IsContent() && tb.GetNumWords() >= 100 && tb.GetTagLevel() == tagLevel)
        {
            tb.SetIsContent(true);
            changes = true;
        }
    }

    return changes;
}
/// <summary>
/// Removes non-content blocks from the document's block list.
/// </summary>
/// <remarks>
/// When <c>labelToKeep</c> is <c>null</c> every non-content block is removed.
/// Otherwise non-content blocks labelled <c>DefaultLabels.TITLE</c> are kept.
/// NOTE(review): the label test uses <c>DefaultLabels.TITLE</c>, not
/// <c>labelToKeep</c> itself - confirm whether that is intended.
/// </remarks>
/// <param name="doc">The document whose block list is filtered in place.</param>
/// <returns><c>true</c> if at least one block was selected for removal.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();

    // Collect first, remove afterwards: the list must not change while it is enumerated.
    var discarded = new List<TextBlock>();
    foreach (TextBlock candidate in textBlocks)
    {
        if (candidate.IsContent())
        {
            continue;
        }

        if (labelToKeep == null || !candidate.HasLabel(DefaultLabels.TITLE))
        {
            discarded.Add(candidate);
        }
    }

    foreach (TextBlock block in discarded)
    {
        textBlocks.Remove(block);
    }

    return discarded.Count > 0;
}
/// <summary>
/// Computes statistics on a given
/// <see cref="TextDocument">TextDocument</see>
/// by adding up the word count and the number of its text blocks.
/// </summary>
/// <param name="doc">
/// The
/// <see cref="TextDocument">TextDocument</see>
/// whose text blocks are counted.
/// </param>
/// <param name="contentOnly">
/// If <c>true</c>, only blocks for which <c>IsContent()</c> returns <c>true</c> are
/// counted; if <c>false</c>, every block is counted.
/// </param>
public TextDocumentStatistics(TextDocument doc, bool contentOnly)
{
    foreach (TextBlock block in doc.GetTextBlocks())
    {
        bool counted = !contentOnly || block.IsContent();
        if (counted)
        {
            numWords += block.GetNumWords();
            numBlocks++;
        }
    }
}
/// <summary>
/// Marks every text block of the document as content.
/// </summary>
/// <param name="doc">The document whose blocks are updated.</param>
/// <returns><c>true</c> if at least one block was not content before the call.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool anyFlipped = false;

    foreach (TextBlock block in doc.GetTextBlocks())
    {
        if (block.IsContent())
        {
            continue;
        }

        block.SetIsContent(true);
        anyFlipped = true;
    }

    return anyFlipped;
}
/// <summary>
/// Marks content blocks as non-content when they carry any of the configured
/// <c>labels</c>.
/// </summary>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns><c>true</c> if at least one block was switched to non-content.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process (TextDocument doc)
{
    bool changes = false;

    foreach (TextBlock block in doc.GetTextBlocks())
    {
        if (!block.IsContent())
        {
            continue;
        }

        foreach (string label in labels)
        {
            if (!block.HasLabel(label))
            {
                continue;
            }

            block.SetIsContent(false);
            changes = true;
            // One matching label is enough; move on to the next block.
            break;
        }
    }

    return changes;
}
/// <summary>
/// Marks content blocks as non-content when <c>GetNumFullTextWords</c> reports fewer
/// than <c>minWords</c> words for them.
/// </summary>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns><c>true</c> if at least one block was switched to non-content.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    bool demotedAny = false;

    foreach (TextBlock block in doc.GetTextBlocks())
    {
        // Blocks that are already non-content are not measured at all.
        if (block.IsContent() && GetNumFullTextWords(block) < minWords)
        {
            block.SetIsContent(false);
            demotedAny = true;
        }
    }

    return demotedAny;
}
/// <summary>
/// Runs the full sequence of filters and classifiers of this extractor over the
/// document, in a fixed order.
/// </summary>
/// <remarks>
/// Every stage is always executed, regardless of what earlier stages returned.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public override bool Process (TextDocument doc)
{
    bool changed = TerminatingBlocksFinder.INSTANCE.Process(doc);

    // The title classifier is created only after the first stage has run.
    changed |= new DocumentTitleMatchClassifier(doc.GetTitle()).Process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.Process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.Process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.Process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.Process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.Process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.Process(doc);
    changed |= ListAtEndFilter.INSTANCE.Process(doc);

    return changed;
}
/// <summary>
/// Marks short blocks whose text matches one of <c>PATTERNS_SHORT</c> as content and
/// labels them <c>DefaultLabels.ARTICLE_METADATA</c>.
/// </summary>
/// <remarks>
/// Only blocks with at most 10 words are tested. Every pattern is tried, even after
/// an earlier pattern has already matched.
/// </remarks>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns><c>true</c> if at least one pattern matched a block.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    bool changed = false;

    foreach (TextBlock block in doc.GetTextBlocks())
    {
        if (block.GetNumWords() <= 10)
        {
            string blockText = block.GetText();
            foreach (Sharpen.Pattern pattern in PATTERNS_SHORT)
            {
                if (!pattern.Matcher(blockText).Find())
                {
                    continue;
                }

                changed = true;
                block.SetIsContent(true);
                block.AddLabel(DefaultLabels.ARTICLE_METADATA);
            }
        }
    }

    return changed;
}
/// <summary>
/// Labels short blocks whose text looks like the start of a comment or feedback
/// section with <c>DefaultLabels.INDICATES_END_OF_TEXT</c>.
/// </summary>
/// <remarks>
/// Only blocks with fewer than 15 words are examined. Trimmed text of at least 8
/// characters is lower-cased and compared against a fixed list of markers. Shorter
/// text is labelled only when the block's link density is exactly 1.0 and the text is
/// exactly "Comment".
/// Lower-casing uses the invariant culture and prefix checks are ordinal, so matching
/// does not depend on the current thread culture.
/// </remarks>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns><c>true</c> if at least one block was labelled.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public virtual bool Process(TextDocument doc)
{
    bool changes = false;

    foreach (TextBlock tb in doc.GetTextBlocks())
    {
        if (tb.GetNumWords() >= 15)
        {
            continue;
        }

        string text = tb.GetText().Trim();
        int len = text.Length;

        if (len >= 8)
        {
            // Culture-sensitive ToLower() would, for example, turn 'I' into a dotless
            // 'i' under a Turkish culture and break the fixed markers below.
            string textLC = text.ToLowerInvariant();

            if (textLC.StartsWith("references", System.StringComparison.Ordinal)
                || StartsWithNumber(textLC, len, " comments", " users responded in")
                || textLC.StartsWith("© reuters", System.StringComparison.Ordinal)
                || textLC.StartsWith("please rate this", System.StringComparison.Ordinal)
                || textLC.StartsWith("post a comment", System.StringComparison.Ordinal)
                || textLC.Contains("what you think...")
                || textLC.Contains("add your comment")
                || textLC.Contains("add comment")
                || textLC.Contains("reader views")
                || textLC.Contains("have your say")
                || textLC.Contains("reader comments")
                || textLC.Contains("rätta artikeln")
                || textLC.Equals("thanks for your comments - this feedback is now closed"))
            {
                tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
                changes = true;
            }
        }
        else if (tb.GetLinkDensity() == 1.0)
        {
            if (text == "Comment")
            {
                tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
                changes = true;
            }
        }
    }

    return changes;
}
/// <summary>
/// Copies the labels of a block onto the block that follows it, each prefixed with
/// <c>labelPrefix</c>.
/// </summary>
/// <remarks>
/// The block list is walked from the last block towards the first, so a block's own
/// labels are read before labels from its predecessor are added to it. The walk stops
/// before the first block is reached, so the first block's labels are not copied.
/// </remarks>
/// <param name="doc">The document whose blocks are labelled.</param>
/// <returns>
/// <c>true</c> if at least one visited block had labels to copy; <c>false</c>
/// otherwise, including when the document has fewer than two blocks.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    bool changes = false;
    int budget = textBlocks.Count;
    TextBlock lower = null;

    foreach (TextBlock upper in textBlocks.Reverse())
    {
        budget--;
        if (budget <= 0)
        {
            break;
        }

        if (lower == null)
        {
            // The very last block only ever receives labels.
            lower = upper;
            continue;
        }

        ICollection<string> upperLabels = upper.GetLabels();
        if (upperLabels != null && upperLabels.Count != 0)
        {
            foreach (string label in upperLabels)
            {
                lower.AddLabel(labelPrefix + label);
            }
            changes = true;
        }

        lower = upper;
    }

    return changes;
}
/// <summary>
/// Merges each block into the block before it when <c>EqualLabels</c> reports that
/// both carry the same labels.
/// </summary>
/// <remarks>
/// Merged blocks are removed from the document's block list. After a merge the
/// surviving block stays the comparison partner for the next block.
/// </remarks>
/// <param name="doc">The document whose block list is fused in place.</param>
/// <returns>
/// <c>true</c> if at least one merge happened; <c>false</c> otherwise, including
/// when the document has fewer than two blocks.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    // Iterate over a snapshot because the live list shrinks while merging.
    List<TextBlock> followers = textBlocks.Skip(1).ToList();

    bool merged = false;
    TextBlock anchor = textBlocks[0];
    foreach (TextBlock follower in followers)
    {
        if (!EqualLabels(anchor.GetLabels(), follower.GetLabels()))
        {
            anchor = follower;
            continue;
        }

        anchor.MergeNext(follower);
        textBlocks.Remove(follower);
        merged = true;
    }

    return merged;
}
/// <summary>
/// Keeps only the content block with the most words as content and demotes every
/// other block, optionally re-promoting neighbours on the same tag level.
/// </summary>
/// <remarks>
/// Every block other than the largest content block is set to non-content and
/// labelled <c>DefaultLabels.MIGHT_BE_CONTENT</c>. When <c>expandToSameLevelText</c>
/// is set and a largest block was found, the list is then walked backwards and
/// forwards from that block: blocks on the same tag level with at least
/// <c>minWords</c> words become content again, and each walk stops at the first
/// block whose tag level is lower than that of the largest block.
/// </remarks>
/// <param name="doc">The document whose blocks are updated.</param>
/// <returns>
/// <c>false</c> if the document has fewer than two blocks; otherwise always
/// <c>true</c>.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    // Pass 1: locate the content block with the highest word count (first one wins ties).
    TextBlock largestBlock = null;
    int maxNumWords = -1;
    int level = -1;
    int n = -1;
    for (int idx = 0; idx < textBlocks.Count; idx++)
    {
        TextBlock candidate = textBlocks[idx];
        if (!candidate.IsContent())
        {
            continue;
        }

        int words = candidate.GetNumWords();
        if (words <= maxNumWords)
        {
            continue;
        }

        largestBlock = candidate;
        maxNumWords = words;
        n = idx;
        if (expandToSameLevelText)
        {
            level = candidate.GetTagLevel();
        }
    }

    // Pass 2: only the largest block stays content.
    foreach (TextBlock block in textBlocks)
    {
        if (block == largestBlock)
        {
            block.SetIsContent(true);
        }
        else
        {
            block.SetIsContent(false);
            block.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
        }
    }

    if (!expandToSameLevelText || n == -1)
    {
        return true;
    }

    // Pass 3a: walk backwards from the block just before the largest one.
    for (int idx = n - 1; idx >= 0; idx--)
    {
        TextBlock block = textBlocks[idx];
        int tagLevel = block.GetTagLevel();
        if (tagLevel < level)
        {
            break;
        }

        if (tagLevel == level && block.GetNumWords() >= minWords)
        {
            block.SetIsContent(true);
        }
    }

    // Pass 3b: walk forwards, starting at the largest block itself.
    for (int idx = n; idx < textBlocks.Count; idx++)
    {
        TextBlock block = textBlocks[idx];
        int tagLevel = block.GetTagLevel();
        if (tagLevel < level)
        {
            break;
        }

        if (tagLevel == level && block.GetNumWords() >= minWords)
        {
            block.SetIsContent(true);
        }
    }

    return true;
}
/// <summary>
/// Runs <c>Process</c> on the given
/// <see cref="NBoilerpipePortable.Document.TextDocument">NBoilerpipePortable.Document.TextDocument</see>
/// and returns the content it reports afterwards.
/// </summary>
/// <param name="doc">
/// The
/// <see cref="NBoilerpipePortable.Document.TextDocument">NBoilerpipePortable.Document.TextDocument</see>
/// to process; it is passed to <c>Process</c> before its content is read.
/// </param>
/// <returns>The value of <c>doc.GetContent()</c> after processing.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException">NBoilerpipePortable.BoilerpipeProcessingException
/// </exception>
public virtual string GetText(TextDocument doc)
{
    // The boolean result of Process is not needed here.
    Process(doc);

    string extracted = doc.GetContent();
    return extracted;
}
/// <summary>
/// Labels every block whose normalised text equals one of <c>potentialTitles</c> with
/// <c>DefaultLabels.TITLE</c>.
/// </summary>
/// <remarks>
/// Normalisation replaces non-breaking spaces with plain spaces, removes
/// apostrophes, trims the text and lower-cases it.
/// </remarks>
/// <param name="doc">The document whose blocks are inspected.</param>
/// <returns>
/// <c>true</c> if at least one block was labelled; <c>false</c> otherwise,
/// including when <c>potentialTitles</c> is <c>null</c>.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    if (potentialTitles == null)
    {
        return false;
    }

    bool labelledAny = false;

    foreach (TextBlock block in doc.GetTextBlocks())
    {
        string normalised = block.GetText()
            .Replace('\u00a0', ' ')
            .Replace("'", "")
            .Trim()
            .ToLower();

        foreach (string candidate in potentialTitles)
        {
            if (!candidate.Equals(normalised))
            {
                continue;
            }

            block.AddLabel(DefaultLabels.TITLE);
            labelledAny = true;
        }
    }

    return labelledAny;
}
/// <summary>
/// Processes the given document; the behaviour is defined by the implementing class.
/// </summary>
/// <param name="arg1">The document to process.</param>
/// <returns>
/// A boolean result defined by the implementation.
/// NOTE(review): the overrides that accompany this declaration return whether they
/// changed the document - confirm that this is the intended contract.
/// </returns>
public abstract bool Process(TextDocument arg1);
/// <summary>
/// Delegates to <c>MarkEverythingContentFilter.INSTANCE</c>.
/// </summary>
/// <param name="doc">The document to process in place.</param>
/// <returns>The result of the delegated <c>Process</c> call.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = MarkEverythingContentFilter.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Runs the word-count classifier, block proximity fusion and the keep-largest-block
/// filter over the document, in that order.
/// </summary>
/// <remarks>
/// Every stage is always executed, regardless of what earlier stages returned.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = NumWordsRulesClassifier.INSTANCE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Delegates to <c>CLASSIFIER</c>.
/// </summary>
/// <param name="doc">The document to process in place.</param>
/// <returns>The result of <c>CLASSIFIER.Process</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = CLASSIFIER.Process(doc);
    return changed;
}
/// <summary>
/// Runs simple block fusion, marks every block as content and then applies
/// <c>filter</c>, in that order.
/// </summary>
/// <remarks>
/// Every stage is always executed, regardless of what earlier stages returned.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public override bool Process(TextDocument doc)
{
    bool changed = SimpleBlockFusionProcessor.INSTANCE.Process(doc);
    changed |= MarkEverythingContentFilter.INSTANCE.Process(doc);
    changed |= filter.Process(doc);
    return changed;
}
/// <summary>
/// Runs simple block fusion, block proximity fusion and the density rules classifier
/// over the document, in that order.
/// </summary>
/// <remarks>
/// Every stage is always executed, regardless of what earlier stages returned.
/// </remarks>
/// <param name="doc">The document to process in place.</param>
/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public override bool Process (TextDocument doc)
{
    bool changed = SimpleBlockFusionProcessor.INSTANCE.Process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
    changed |= DensityRulesClassifier.INSTANCE.Process(doc);
    return changed;
}
/// <summary>
/// Merges content blocks into the preceding block when the two lie within
/// <c>maxBlocksDistance</c> blocks of each other.
/// </summary>
/// <remarks>
/// With <c>contentOnly</c> set, the scan starts after the first content block and a
/// merge also requires the preceding block to be content. With
/// <c>sameTagLevelOnly</c> set, both blocks must share the same tag level. A
/// non-content block is never merged; it simply becomes the new preceding block.
/// Merged blocks are removed from the document's list after the scan has finished.
/// </remarks>
/// <param name="doc">The document whose block list is fused in place.</param>
/// <returns>
/// <c>true</c> if at least one merge happened; <c>false</c> otherwise, including
/// when the document has fewer than two blocks or, with <c>contentOnly</c>, no
/// content block at all.
/// </returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    IList<TextBlock> textBlocks = doc.GetTextBlocks();
    if (textBlocks.Count < 2)
    {
        return false;
    }

    TextBlock prevBlock;
    int offset;
    if (contentOnly)
    {
        // Find the first content block; offset ends up one past its position.
        prevBlock = null;
        offset = 0;
        while (offset < textBlocks.Count)
        {
            TextBlock candidate = textBlocks[offset];
            offset++;
            if (candidate.IsContent())
            {
                prevBlock = candidate;
                break;
            }
        }

        if (prevBlock == null)
        {
            return false;
        }
    }
    else
    {
        prevBlock = textBlocks[0];
        offset = 1;
    }

    bool changes = false;
    var mergedAway = new List<TextBlock>();

    for (int idx = offset; idx < textBlocks.Count; idx++)
    {
        TextBlock block = textBlocks[idx];
        if (!block.IsContent())
        {
            prevBlock = block;
            continue;
        }

        int gap = block.GetOffsetBlocksStart() - prevBlock.GetOffsetBlocksEnd() - 1;

        bool mergeable = gap <= maxBlocksDistance;
        if (mergeable && contentOnly)
        {
            mergeable = prevBlock.IsContent() && block.IsContent();
        }
        if (mergeable && sameTagLevelOnly)
        {
            mergeable = prevBlock.GetTagLevel() == block.GetTagLevel();
        }

        if (mergeable)
        {
            prevBlock.MergeNext(block);
            mergedAway.Add(block);
            changes = true;
        }
        else
        {
            prevBlock = block;
        }
    }

    // Removal is deferred so the list is not modified during the scan.
    foreach (TextBlock removal in mergedAway)
    {
        textBlocks.Remove(removal);
    }

    return changes;
}