/// <summary>
/// Keeps only the content block for which <c>GetNumFullTextWords</c> reports the highest
/// value. Every other block is demoted to non-content and labelled
/// <c>DefaultLabels.MIGHT_BE_CONTENT</c>.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns>
/// <c>false</c> if the document has fewer than two blocks or no content block at all;
/// otherwise <c>true</c>.
/// </returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> blocks = doc.TextBlocks;
    if (blocks.Count < 2)
    {
        return false;
    }

    // Find the content block with the highest full-text word count (first one wins on ties).
    TextBlock winner = null;
    int bestWordCount = -1;
    foreach (TextBlock candidate in blocks)
    {
        if (candidate.IsContent)
        {
            int wordCount = GetNumFullTextWords(candidate);
            if (wordCount > bestWordCount)
            {
                winner = candidate;
                bestWordCount = wordCount;
            }
        }
    }

    if (winner == null)
    {
        return false;
    }

    // Only the winner stays content; everything else is flagged as a possible candidate.
    foreach (TextBlock block in blocks)
    {
        bool isWinner = block == winner;
        block.IsContent = isWinner;
        if (!isWinner)
        {
            block.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
        }
    }

    return true;
}
/// <summary>
/// Marks blocks labelled <c>DefaultLabels.MIGHT_BE_CONTENT</c> as content when they lie
/// between the last block labelled <c>DefaultLabels.TITLE</c> that precedes the content
/// and the first content block.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns><c>true</c> if at least one block was switched to content.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> blocks = doc.TextBlocks;

    int titleIndex = -1;
    int firstContentIndex = -1;
    int position = 0;
    foreach (TextBlock block in blocks)
    {
        // Once the first content block has been found, nothing else is inspected.
        if (firstContentIndex == -1)
        {
            if (block.HasLabel(DefaultLabels.TITLE))
            {
                titleIndex = position;
            }

            if (block.IsContent)
            {
                firstContentIndex = position;
            }
        }

        position++;
    }

    if (titleIndex == -1 || firstContentIndex <= titleIndex)
    {
        return false;
    }

    bool changes = false;
    for (int index = titleIndex; index < firstContentIndex; index++)
    {
        TextBlock block = blocks[index];
        if (block.HasLabel(DefaultLabels.MIGHT_BE_CONTENT) && !block.IsContent)
        {
            block.IsContent = true;
            changes = true;
        }
    }

    return changes;
}
/// <summary>
/// Calls <c>Classify</c> once for every text block, passing the block together with its
/// predecessor and successor. <c>TextBlock.EmptyStart</c> stands in for the missing
/// neighbour before the first block and after the last block.
/// </summary>
/// <param name="doc">The document whose text blocks are classified.</param>
/// <returns><c>true</c> if any <c>Classify</c> call reported a change.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> textBlocks = doc.TextBlocks;
    bool hasChanges = false;

    // IEnumerator<T> is IDisposable; the using statement guarantees it is released
    // on every exit path, including the early return and exceptions from Classify.
    using (IEnumerator<TextBlock> it = textBlocks.GetEnumerator())
    {
        if (!it.MoveNext())
        {
            return false;
        }

        TextBlock prevBlock = TextBlock.EmptyStart;
        TextBlock currentBlock = it.Current;
        TextBlock nextBlock = it.MoveNext() ? it.Current : TextBlock.EmptyStart;

        hasChanges = Classify(prevBlock, currentBlock, nextBlock) | hasChanges;

        if (nextBlock != TextBlock.EmptyStart)
        {
            // Slide the three-block window forward while there is a real successor.
            while (it.MoveNext())
            {
                prevBlock = currentBlock;
                currentBlock = nextBlock;
                nextBlock = it.Current;
                hasChanges = Classify(prevBlock, currentBlock, nextBlock) | hasChanges;
            }

            // The last block has no successor; classify it against the placeholder.
            prevBlock = currentBlock;
            currentBlock = nextBlock;
            nextBlock = TextBlock.EmptyStart;
            hasChanges = Classify(prevBlock, currentBlock, nextBlock) | hasChanges;
        }
    }

    return hasChanges;
}
/// <summary>
/// Copies the labels of each text block onto the block that follows it, prefixing every
/// copied label with <c>_labelPrefix</c>. Blocks are visited from the end of the document
/// towards the start.
/// </summary>
/// <remarks>
/// The block list is walked by index and is never reordered. Calling
/// <c>List&lt;T&gt;.Reverse()</c> on <c>doc.TextBlocks</c> would reverse the document's
/// block order in place and leave it reversed for every later processing step.
/// </remarks>
/// <param name="doc">The document whose text blocks receive the prefixed labels.</param>
/// <returns><c>true</c> if at least one block had labels that were copied.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> textBlocks = doc.TextBlocks;
    if (textBlocks.Count < 2)
    {
        return false;
    }

    bool changes = false;
    TextBlock blockBelow = textBlocks[textBlocks.Count - 1];

    for (int i = textBlocks.Count - 2; i >= 0; i--)
    {
        TextBlock block = textBlocks[i];
        IEnumerable<string> labels = block.Labels;
        if (labels != null)
        {
            // Snapshot the labels before adding to another block's label collection.
            IList<string> snapshot = labels.ToList();
            if (snapshot.Count > 0)
            {
                foreach (string label in snapshot)
                {
                    blockBelow.AddLabel(_labelPrefix + label);
                }

                changes = true;
            }
        }

        blockBelow = block;
    }

    return changes;
}
/// <summary>
/// Keeps only the content block with the highest <c>NumWords</c>; all other blocks are
/// demoted to non-content and labelled <c>DefaultLabels.MIGHT_BE_CONTENT</c>. When
/// <c>_expandToSameLevelText</c> is set, neighbouring blocks on the same tag level as the
/// largest block are marked as content again, in both directions, until a block with a
/// lower tag level is reached.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns>
/// <c>false</c> if the document has fewer than two blocks; otherwise always <c>true</c>.
/// </returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> textBlocks = doc.TextBlocks;
    if (textBlocks.Count < 2)
    {
        return false;
    }

    int maxNumWords = -1;
    TextBlock largestBlock = null;
    int level = -1;
    int largestIndex = -1;

    for (int i = 0; i < textBlocks.Count; i++)
    {
        TextBlock tb = textBlocks[i];
        if (!tb.IsContent)
        {
            continue;
        }

        int nw = tb.NumWords;
        if (nw > maxNumWords)
        {
            largestBlock = tb;
            maxNumWords = nw;
            largestIndex = i;
            if (_expandToSameLevelText)
            {
                level = tb.TagLevel;
            }
        }
    }

    foreach (TextBlock tb in textBlocks)
    {
        if (tb == largestBlock)
        {
            tb.IsContent = true;
        }
        else
        {
            tb.IsContent = false;
            tb.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
        }
    }

    if (_expandToSameLevelText && largestIndex != -1)
    {
        // Backwards: start at the block directly before the largest one and walk
        // towards the start of the document. The previous Skip(n).Reverse() walked
        // from the end of the document towards the largest block instead.
        for (int i = largestIndex - 1; i >= 0; i--)
        {
            TextBlock tb = textBlocks[i];
            int tl = tb.TagLevel;
            if (tl < level)
            {
                break;
            }

            if (tl == level)
            {
                tb.IsContent = true;
            }
        }

        // Forwards: start at the largest block and walk towards the end of the document.
        for (int i = largestIndex; i < textBlocks.Count; i++)
        {
            TextBlock tb = textBlocks[i];
            int tl = tb.TagLevel;
            if (tl < level)
            {
                break;
            }

            if (tl == level)
            {
                tb.IsContent = true;
            }
        }
    }

    return true;
}
/// <summary>
/// Splits every text block whose text spans several paragraphs (separated by runs of
/// <c>\n</c> / <c>\r</c>) into one new block per paragraph. Each new block inherits the
/// content flag and the labels of the block it was split from.
/// </summary>
/// <remarks>
/// <c>Regex.Split</c> yields empty strings when the text starts or ends with a line
/// break. Those empty pieces are discarded, so a block with only leading or trailing
/// line breaks is kept as it is instead of being replaced by a copy plus empty blocks.
/// </remarks>
/// <param name="doc">The document whose block list is rebuilt when a split happens.</param>
/// <returns><c>true</c> if at least one block was split.</returns>
public bool Process(TextDocument doc)
{
    bool changes = false;
    List<TextBlock> blocks = doc.TextBlocks;
    var blocksNew = new List<TextBlock>();

    foreach (TextBlock tb in blocks)
    {
        string[] paragraphs = Regex.Split(tb.Text, "[\n\r]+")
            .Where(p => p.Length > 0)
            .ToArray();

        if (paragraphs.Length < 2)
        {
            blocksNew.Add(tb);
            continue;
        }

        bool isContent = tb.IsContent;
        List<string> labels = (tb.Labels ?? Enumerable.Empty<string>()).ToList();

        foreach (string p in paragraphs)
        {
            var tbP = new TextBlock(p) { IsContent = isContent };
            tbP.AddLabels(labels);
            blocksNew.Add(tbP);
        }

        changes = true;
    }

    if (changes)
    {
        blocks.Clear();
        blocks.AddRange(blocksNew);
    }

    return changes;
}
/// <summary>
/// Walks the text blocks from the end of the document towards the start. Blocks labelled
/// <c>DefaultLabels.INDICATES_END_OF_TEXT</c> are marked
/// <c>DefaultLabels.STRICTLY_NOT_CONTENT</c>, lose <c>DefaultLabels.MIGHT_BE_CONTENT</c>
/// and are set to non-content. The walk stops once more than 200 words of content blocks
/// have been passed.
/// </summary>
/// <remarks>
/// The block list is walked by index and is never reordered. Calling
/// <c>List&lt;T&gt;.Reverse()</c> on <c>doc.TextBlocks</c> would reverse the document's
/// block order in place and leave it reversed for every later processing step.
/// </remarks>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns><c>true</c> if at least one end-of-text block was demoted.</returns>
public bool Process(TextDocument doc)
{
    bool changes = false;
    int words = 0;
    List<TextBlock> blocks = doc.TextBlocks;

    for (int i = blocks.Count - 1; i >= 0; i--)
    {
        TextBlock tb = blocks[i];
        if (tb.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT))
        {
            tb.AddLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
            tb.RemoveLabel(DefaultLabels.MIGHT_BE_CONTENT);
            tb.IsContent = false;
            changes = true;
        }
        else if (tb.IsContent)
        {
            words += tb.NumWords;
            if (words > 200)
            {
                break;
            }
        }
    }

    return changes;
}
/// <summary>
/// Repeatedly merges a block into the preceding block when the preceding block is
/// content, the block's link density is below 0.56 and the block is not labelled
/// <c>DefaultLabels.STRICTLY_NOT_CONTENT</c>. Passes are repeated until a pass performs
/// no merge.
/// </summary>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns>
/// <c>false</c> if the document has fewer than two blocks; otherwise always <c>true</c>,
/// whether or not a merge happened (kept as-is for existing callers).
/// </returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> textBlocks = doc.TextBlocks;
    if (textBlocks.Count < 2)
    {
        return false;
    }

    bool changes;
    do
    {
        changes = false;

        // Restart from the first block on every pass. Carrying the previous pass's
        // last block over could pair a block with itself (merge it into itself and
        // then remove it from the list).
        TextBlock prevBlock = textBlocks[0];

        // Iterate over a snapshot because merged blocks are removed from the list.
        foreach (TextBlock block in textBlocks.Skip(1).ToList())
        {
            if (prevBlock.IsContent
                && block.LinkDensity < 0.56
                && !block.HasLabel(DefaultLabels.STRICTLY_NOT_CONTENT))
            {
                prevBlock.MergeNext(block);
                textBlocks.Remove(block);
                changes = true;
            }
            else
            {
                prevBlock = block;
            }
        }
    }
    while (changes);

    return true;
}
/// <summary>
/// Labels short text blocks (fewer than 15 words, at least 8 trimmed characters) with
/// <c>DefaultLabels.INDICATES_END_OF_TEXT</c> when their lower-cased text matches one of
/// a fixed set of phrases such as "comments", "post a comment" or "have your say".
/// </summary>
/// <remarks>
/// Lower-casing uses the invariant culture and prefix checks use ordinal comparison, so
/// the result does not depend on the current thread culture (for example the Turkish
/// dotless-i casing rules).
/// </remarks>
/// <param name="doc">The document whose text blocks are examined and labelled.</param>
/// <returns><c>true</c> if at least one block was labelled.</returns>
public bool Process(TextDocument doc)
{
    bool changes = false;

    foreach (TextBlock tb in doc.TextBlocks)
    {
        if (tb.NumWords >= 15)
        {
            continue;
        }

        string text = tb.Text.Trim();
        int len = text.Length;
        if (len < 8)
        {
            continue;
        }

        string textLC = text.ToLowerInvariant();

        bool indicatesEnd =
            textLC.StartsWith("comments", System.StringComparison.Ordinal)
            || StartsWithNumber(textLC, len, " comments", " users responded in")
            || textLC.StartsWith("© reuters", System.StringComparison.Ordinal)
            || textLC.StartsWith("please rate this", System.StringComparison.Ordinal)
            || textLC.StartsWith("post a comment", System.StringComparison.Ordinal)
            || textLC.Contains("what you think...")
            || textLC.Contains("add your comment")
            || textLC.Contains("add comment")
            || textLC.Contains("reader views")
            || textLC.Contains("have your say")
            || textLC.Contains("reader comments")
            || textLC.Contains("rätta artikeln")
            || textLC.Equals("thanks for your comments - this feedback is now closed");

        if (indicatesEnd)
        {
            tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
            changes = true;
        }
    }

    return changes;
}
/// <summary>
/// Extracts text from the given <see cref="TextDocument" /> object by running
/// <c>Process</c> on it and returning its <c>Content</c>.
/// </summary>
/// <param name="doc">The <see cref="TextDocument" />.</param>
/// <returns>The extracted text.</returns>
/// <exception cref="BoilerpipeProcessingException">
/// Thrown when processing or reading the content fails. An exception that is already a
/// <see cref="BoilerpipeProcessingException" /> is rethrown unchanged; any other
/// exception is wrapped, with the original available as the inner exception.
/// </exception>
public string GetText(TextDocument doc)
{
    try
    {
        Process(doc);
        return doc.Content;
    }
    catch (BoilerpipeProcessingException)
    {
        // Already the documented exception type: rethrow as-is (preserving the stack
        // trace) instead of wrapping it a second time.
        throw;
    }
    catch (Exception ex)
    {
        throw new BoilerpipeProcessingException(ex.Message, ex);
    }
}
/// <summary>
/// Merges a content block into the block before it when the gap between them
/// (<c>OffsetBlocksStart</c> of the block minus <c>OffsetBlocksEnd</c> of the previous
/// block, minus one) does not exceed <c>_maxBlocksDistance</c>. With <c>_contentOnly</c>
/// both blocks must be content; with <c>_sameTagLevelOnly</c> both must share the same
/// tag level.
/// </summary>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns><c>true</c> if at least one merge was performed.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> blocks = doc.TextBlocks;
    if (blocks.Count < 2)
    {
        return false;
    }

    TextBlock anchor;
    int startIndex;
    if (!_contentOnly)
    {
        anchor = blocks[0];
        startIndex = 1;
    }
    else
    {
        // Start at the first content block; iteration resumes right after it.
        anchor = null;
        startIndex = 0;
        foreach (TextBlock candidate in blocks)
        {
            startIndex++;
            if (candidate.IsContent)
            {
                anchor = candidate;
                break;
            }
        }

        if (anchor == null)
        {
            return false;
        }
    }

    bool merged = false;

    // Iterate over a snapshot because merged blocks are removed from the list.
    foreach (TextBlock current in blocks.Skip(startIndex).ToList())
    {
        if (!current.IsContent)
        {
            anchor = current;
            continue;
        }

        int gap = current.OffsetBlocksStart - anchor.OffsetBlocksEnd - 1;

        bool fuse = gap <= _maxBlocksDistance;
        if (fuse && _contentOnly && (!anchor.IsContent || !current.IsContent))
        {
            fuse = false;
        }

        if (fuse && _sameTagLevelOnly && anchor.TagLevel != current.TagLevel)
        {
            fuse = false;
        }

        if (fuse)
        {
            anchor.MergeNext(current);
            blocks.Remove(current);
            merged = true;
        }
        else
        {
            anchor = current;
        }
    }

    return merged;
}
/// <summary>
/// Marks every non-content block as content when it carries at least one of the labels
/// in <c>_labels</c>.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns><c>true</c> if at least one block was switched to content.</returns>
public bool Process(TextDocument doc)
{
    bool promoted = false;

    foreach (TextBlock block in doc.TextBlocks)
    {
        if (block.IsContent)
        {
            continue;
        }

        if (!_labels.Any(block.HasLabel))
        {
            continue;
        }

        block.IsContent = true;
        promoted = true;
    }

    return promoted;
}
/// <summary>
/// Demotes every content block that has fewer than <c>_minWords</c> words to non-content.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns><c>true</c> if at least one block was demoted.</returns>
public bool Process(TextDocument doc)
{
    bool demoted = false;

    foreach (TextBlock block in doc.TextBlocks)
    {
        if (!block.IsContent)
        {
            continue;
        }

        if (block.NumWords < _minWords)
        {
            block.IsContent = false;
            demoted = true;
        }
    }

    return demoted;
}
/// <summary>
/// Removes every block that is not marked as content from the document's block list.
/// </summary>
/// <param name="doc">The document whose block list is filtered in place.</param>
/// <returns><c>true</c> if at least one block was removed.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> textBlocks = doc.TextBlocks;

    // RemoveAll filters the list in a single pass; copying the list and calling
    // Remove for each non-content block is quadratic in the number of blocks.
    int removed = textBlocks.RemoveAll(textBlock => !textBlock.IsContent);
    return removed > 0;
}
/// <summary>
/// Runs a fixed sequence of processing steps on the document. Every step is always
/// executed, in the order listed, regardless of the results of earlier steps.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns><c>true</c> if any step reported a change.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = TerminatingBlocksFinder.Instance.Process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.Title).Process(doc);
    changed |= NumWordsRulesClassifier.Instance.Process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DefaultInstance.Process(doc);
    changed |= BlockProximityFusion.MaxDistance1.Process(doc);
    changed |= BoilerplateBlockFilter.Instance.Process(doc);
    changed |= BlockProximityFusion.MaxDistance1ContentOnly.Process(doc);
    changed |= KeepLargestBlockFilter.Instance.Process(doc);
    changed |= ExpandTitleToContentFilter.Instance.Process(doc);
    return changed;
}
/// <summary>
/// Inverts the content flag of every text block: content becomes non-content and
/// vice versa.
/// </summary>
/// <param name="doc">The document whose text blocks are flipped.</param>
/// <returns><c>false</c> if the document has no blocks; otherwise <c>true</c>.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> blocks = doc.TextBlocks;
    if (blocks.Count == 0)
    {
        return false;
    }

    foreach (TextBlock block in blocks)
    {
        bool wasContent = block.IsContent;
        block.IsContent = !wasContent;
    }

    return true;
}
/// <summary>
/// Checks the text of every block with at most 10 words against each pattern in
/// <c>PatternsShort</c>. For every matching pattern the block is marked as content and
/// labelled <c>DefaultLabels.ARTICLE_METADATA</c>.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns><c>true</c> if at least one pattern matched a block.</returns>
public bool Process(TextDocument doc)
{
    bool anyMatch = false;

    foreach (TextBlock block in doc.TextBlocks)
    {
        if (block.NumWords <= 10)
        {
            string blockText = block.Text;
            foreach (Regex pattern in PatternsShort)
            {
                if (!pattern.IsMatch(blockText))
                {
                    continue;
                }

                anyMatch = true;
                block.IsContent = true;
                block.AddLabel(DefaultLabels.ARTICLE_METADATA);
            }
        }
    }

    return anyMatch;
}
/// <summary>
/// Labels a block with <c>DefaultLabels.TITLE</c> for every entry of
/// <c>_potentialTitles</c> that equals the block's trimmed text.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and labelled.</param>
/// <returns>
/// <c>false</c> if <c>_potentialTitles</c> is <c>null</c> or nothing matched; otherwise
/// <c>true</c>.
/// </returns>
public bool Process(TextDocument doc)
{
    if (_potentialTitles == null)
    {
        return false;
    }

    bool labelled = false;

    foreach (TextBlock block in doc.TextBlocks)
    {
        string trimmed = block.Text.Trim();
        foreach (string title in _potentialTitles)
        {
            if (!title.Equals(trimmed))
            {
                continue;
            }

            block.AddLabel(DefaultLabels.TITLE);
            labelled = true;
        }
    }

    return labelled;
}
/// <summary>
/// Accumulates <c>GetNumFullTextWords</c> over content blocks. Once a block labelled
/// <c>DefaultLabels.INDICATES_END_OF_TEXT</c> is reached with at least
/// <c>_minNumWords</c> accumulated, that block and every block after it are set to
/// non-content.
/// </summary>
/// <param name="doc">The document whose text blocks are examined and updated.</param>
/// <returns><c>true</c> if the end-of-text cut-off was reached.</returns>
public bool Process(TextDocument doc)
{
    int contentWords = 0;
    bool cutOff = false;

    foreach (TextBlock block in doc.TextBlocks)
    {
        bool marksEnd = block.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT);

        if (block.IsContent)
        {
            contentWords += GetNumFullTextWords(block);
        }

        // The cut-off is sticky: once reached, it applies to all remaining blocks.
        if (!cutOff)
        {
            cutOff = marksEnd && contentWords >= _minNumWords;
        }

        if (cutOff)
        {
            block.IsContent = false;
        }
    }

    return cutOff;
}
/// <summary>
/// Demotes content blocks to non-content when none of their clauses satisfies
/// <c>IsClause</c>. The text is cut into clauses at each match of
/// <c>_patClauseDelimiter</c>; the remaining text after the last delimiter is only
/// considered when <c>_acceptClausesWithoutDelimiter</c> is set.
/// </summary>
/// <param name="doc">The document whose content blocks are examined and updated.</param>
/// <returns><c>true</c> if at least one block was demoted.</returns>
public bool Process(TextDocument doc)
{
    bool changes = false;

    foreach (TextBlock tb in doc.TextBlocks)
    {
        if (!tb.IsContent)
        {
            continue;
        }

        string text = tb.Text;
        int start = 0;
        bool hasClause = false;

        Match m = _patClauseDelimiter.Match(text);
        while (m.Success)
        {
            // The clause runs from 'start' through the first character of the match.
            int length = m.Index + 1 - start;
            hasClause = IsClause(text.Substring(start, length));

            // The next clause begins right after the matched delimiter. Advancing by
            // only m.Length (without m.Index) would leave 'start' before the match and
            // make the same delimiter match again.
            start = m.Index + m.Length;

            if (hasClause)
            {
                break;
            }

            m = m.NextMatch();
        }

        // Since clauses should *always end* with a delimiter, text without one is
        // normally not considered.
        if (_acceptClausesWithoutDelimiter)
        {
            hasClause |= IsClause(text.Substring(start));
        }

        if (!hasClause)
        {
            tb.IsContent = false;
            changes = true;
        }
    }

    return changes;
}
/// <summary>
/// Merges each block into the block before it when <c>EqualLabels</c> reports that both
/// carry the same labels.
/// </summary>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns><c>true</c> if at least one merge was performed.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> blocks = doc.TextBlocks;
    if (blocks.Count < 2)
    {
        return false;
    }

    bool fused = false;
    TextBlock keeper = blocks[0];

    // Iterate over a snapshot because merged blocks are removed from the list.
    foreach (TextBlock next in blocks.Skip(1).ToList())
    {
        if (!EqualLabels(keeper.Labels, next.Labels))
        {
            keeper = next;
            continue;
        }

        keeper.MergeNext(next);
        blocks.Remove(next);
        fused = true;
    }

    return fused;
}
/// <summary>
/// Merges each block into the block before it when both have exactly the same
/// <c>TextDensity</c>.
/// </summary>
/// <param name="doc">The document whose text blocks are merged in place.</param>
/// <returns><c>true</c> if at least one merge was performed.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> blocks = doc.TextBlocks;
    if (blocks.Count < 2)
    {
        return false;
    }

    bool fused = false;
    TextBlock keeper = blocks[0];

    // Iterate over a snapshot because merged blocks are removed from the list.
    foreach (TextBlock next in blocks.Skip(1).ToList())
    {
        if (keeper.TextDensity == next.TextDensity)
        {
            keeper.MergeNext(next);
            blocks.Remove(next);
            fused = true;
        }
        else
        {
            keeper = next;
        }
    }

    return fused;
}
/// <summary>
/// Processes the given document <c>doc</c>.
/// </summary>
/// <param name="doc">The <see cref="TextDocument" /> that is to be processed.</param>
/// <returns><c>true</c> if changes have been made to the <see cref="TextDocument" />.</returns>
/// <exception cref="BoilerpipeProcessingException">
/// May be thrown by an implementation; the exact conditions depend on the implementation.
/// </exception>
public abstract bool Process(TextDocument doc);
/// <summary>
/// Runs <c>SimpleBlockFusionProcessor</c>, <c>MarkEverythingContentFilter</c> and then
/// <c>_filter</c> on the document. Every step is always executed, in that order.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns><c>true</c> if any step reported a change.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = SimpleBlockFusionProcessor.Instance.Process(doc);
    changed |= MarkEverythingContentFilter.Instance.Process(doc);
    changed |= _filter.Process(doc);
    return changed;
}
/// <summary>
/// Runs <c>ArticleExtractor</c>, <c>SplitParagraphBlocksFilter</c> and then
/// <c>MinClauseWordsFilter</c> on the document. Every step is always executed, in that
/// order.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns><c>true</c> if any step reported a change.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = ArticleExtractor.Instance.Process(doc);
    changed |= SplitParagraphBlocksFilter.Instance.Process(doc);
    changed |= MinClauseWordsFilter.Instance.Process(doc);
    return changed;
}
/// <summary>
/// Delegates processing of the document to <c>Classifier</c>.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns>The result reported by <c>Classifier.Process</c>.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = Classifier.Process(doc);
    return changed;
}
/// <summary>
/// Runs <c>NumWordsRulesClassifier</c>, <c>BlockProximityFusion.MaxDistance1</c> and then
/// <c>KeepLargestBlockFilter</c> on the document. Every step is always executed, in that
/// order.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns><c>true</c> if any step reported a change.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = NumWordsRulesClassifier.Instance.Process(doc);
    changed |= BlockProximityFusion.MaxDistance1.Process(doc);
    changed |= KeepLargestBlockFilter.Instance.Process(doc);
    return changed;
}
/// <summary>
/// Runs <c>SimpleBlockFusionProcessor</c>, <c>BlockProximityFusion.MaxDistance1</c> and
/// then <c>DensityRulesClassifier</c> on the document. Every step is always executed, in
/// that order.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns><c>true</c> if any step reported a change.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = SimpleBlockFusionProcessor.Instance.Process(doc);
    changed |= BlockProximityFusion.MaxDistance1.Process(doc);
    changed |= DensityRulesClassifier.Instance.Process(doc);
    return changed;
}
/// <summary>
/// Delegates processing of the document to <c>MarkEverythingContentFilter.Instance</c>.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns>The result reported by the filter.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = MarkEverythingContentFilter.Instance.Process(doc);
    return changed;
}
/// <summary>
/// Delegates processing of the document to <c>NumWordsRulesClassifier.Instance</c>.
/// </summary>
/// <param name="doc">The document to process.</param>
/// <returns>The result reported by the classifier.</returns>
public override bool Process(TextDocument doc)
{
    bool changed = NumWordsRulesClassifier.Instance.Process(doc);
    return changed;
}