/// <summary>
///     Keeps only the content block with the highest full-text word count (as reported by
///     <c>GetNumFullTextWords</c>); every other block is set to non-content and labelled
///     <c>MIGHT_BE_CONTENT</c>.
/// </summary>
/// <param name="doc">The document whose text blocks are filtered.</param>
/// <returns>
///     <c>false</c> if the document has fewer than two blocks or no content block at all;
///     otherwise <c>true</c>.
/// </returns>
public bool Process(TextDocument doc) {
      List<TextBlock> blocks = doc.TextBlocks;
      if (blocks.Count < 2) {
        return false;
      }

      // First pass: find the content block with the most full-text words.
      // On ties the earliest block wins because the comparison is strict.
      TextBlock winner = null;
      int winnerWords = -1;
      foreach (TextBlock candidate in blocks) {
        if (candidate.IsContent) {
          int words = GetNumFullTextWords(candidate);
          if (words > winnerWords) {
            winner = candidate;
            winnerWords = words;
          }
        }
      }

      if (winner == null) {
        return false;
      }

      // Second pass: demote everything except the winner.
      foreach (TextBlock candidate in blocks) {
        bool keep = candidate == winner;
        candidate.IsContent = keep;
        if (!keep) {
          candidate.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
        }
      }

      return true;
    }
    /// <summary>
    ///     Promotes blocks labelled <c>MIGHT_BE_CONTENT</c> to content when they lie between the
    ///     last <c>TITLE</c>-labelled block seen before the first content block (inclusive) and
    ///     that first content block (exclusive).
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected.</param>
    /// <returns><c>true</c> if at least one block was switched to content.</returns>
    public bool Process(TextDocument doc) {
      int titleIndex = -1;
      int firstContentIndex = -1;
      int index = 0;
      foreach (TextBlock block in doc.TextBlocks) {
        // Once the first content block is known, neither index changes any more.
        if (firstContentIndex == -1) {
          if (block.HasLabel(DefaultLabels.TITLE)) {
            titleIndex = index;
          }
          if (block.IsContent) {
            firstContentIndex = index;
          }
        }
        index++;
      }

      // Nothing to do without a title that strictly precedes the first content block.
      if (titleIndex == -1 || firstContentIndex <= titleIndex) {
        return false;
      }

      bool changes = false;
      int span = firstContentIndex - titleIndex;
      foreach (TextBlock block in doc.TextBlocks.Skip(titleIndex).Take(span)) {
        if (!block.HasLabel(DefaultLabels.MIGHT_BE_CONTENT) || block.IsContent) {
          continue;
        }
        block.IsContent = true;
        changes = true;
      }
      return changes;
    }
      /// <summary>
      ///     Calls <c>Classify</c> once per text block, passing the block together with its
      ///     predecessor and successor. <c>TextBlock.EmptyStart</c> stands in for the missing
      ///     neighbour at both ends of the list.
      /// </summary>
      /// <param name="doc">The document whose text blocks are classified.</param>
      /// <returns><c>true</c> if any <c>Classify</c> call returned <c>true</c>.</returns>
      public bool Process(TextDocument doc) {
        List<TextBlock> textBlocks = doc.TextBlocks;
        bool hasChanges = false;

        // An index loop replaces the hand-rolled enumerator walk: no undisposed
        // IEnumerator<T>, and no reliance on comparing a real block against the
        // EmptyStart sentinel to detect the end of the list.
        int last = textBlocks.Count - 1;
        for (int i = 0; i <= last; i++) {
          TextBlock prevBlock = i > 0 ? textBlocks[i - 1] : TextBlock.EmptyStart;
          TextBlock nextBlock = i < last ? textBlocks[i + 1] : TextBlock.EmptyStart;
          // Non-short-circuit OR so Classify runs for every block.
          hasChanges = Classify(prevBlock, textBlocks[i], nextBlock) | hasChanges;
        }

        return hasChanges;
      }
    /// <summary>
    ///     Copies the labels of each block onto the block directly below it, prefixing every
    ///     copied label with <c>_labelPrefix</c>. Blocks are visited from the bottom of the
    ///     document upwards, so a block's own labels are read before it receives prefixed
    ///     labels from its predecessor.
    /// </summary>
    /// <param name="doc">The document whose text blocks are labelled.</param>
    /// <returns>
    ///     <c>false</c> if the document has fewer than two blocks or no block had labels to copy;
    ///     otherwise <c>true</c>.
    /// </returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> textBlocks = doc.TextBlocks;
      if (textBlocks.Count < 2) {
        return false;
      }

      bool changes = false;

      // Iterate backwards by index. List<T>.Reverse() reverses in place, so calling it
      // here would leave the document's block list in reversed order for every later
      // consumer (assuming TextBlocks exposes the live list — other filters remove
      // from it directly).
      for (int i = textBlocks.Count - 1; i > 0; i--) {
        TextBlock blockBelow = textBlocks[i];
        TextBlock block = textBlocks[i - 1];

        IEnumerable<string> labels = block.Labels;
        if (labels == null) {
          continue;
        }

        // Snapshot first so adding labels can never disturb the enumeration.
        List<string> snapshot = labels.ToList();
        foreach (string label in snapshot) {
          blockBelow.AddLabel(_labelPrefix + label);
        }
        if (snapshot.Count > 0) {
          changes = true;
        }
      }

      return changes;
    }
    /// <summary>
    ///     Keeps the content block with the most words (<c>NumWords</c>) as content and sets
    ///     every other block to non-content with the label <c>MIGHT_BE_CONTENT</c>. When
    ///     <c>_expandToSameLevelText</c> is set, neighbouring blocks above and below the largest
    ///     block that share its <c>TagLevel</c> are re-marked as content; the expansion in each
    ///     direction stops at the first block with a lower tag level.
    /// </summary>
    /// <param name="doc">The document whose text blocks are filtered.</param>
    /// <returns>
    ///     <c>false</c> if the document has fewer than two blocks; otherwise always <c>true</c>.
    /// </returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> textBlocks = doc.TextBlocks;
      if (textBlocks.Count < 2) {
        return false;
      }

      int maxNumWords = -1;
      TextBlock largestBlock = null;
      int level = -1;
      int n = -1; // index of the largest content block, -1 if there is none

      for (int i = 0; i < textBlocks.Count; i++) {
        TextBlock tb = textBlocks[i];
        if (!tb.IsContent) {
          continue;
        }
        int nw = tb.NumWords;
        if (nw > maxNumWords) {
          largestBlock = tb;
          maxNumWords = nw;
          n = i;
          if (_expandToSameLevelText) {
            level = tb.TagLevel;
          }
        }
      }

      foreach (TextBlock tb in textBlocks) {
        if (tb == largestBlock) {
          tb.IsContent = true;
        } else {
          tb.IsContent = false;
          tb.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
        }
      }

      if (_expandToSameLevelText && n != -1) {
        // Upwards: blocks n-1 .. 0. The earlier Skip(n).Reverse() walked from the END of
        // the list down to n, so it never visited the blocks above the largest one.
        for (int i = n - 1; i >= 0; i--) {
          TextBlock tb = textBlocks[i];
          int tl = tb.TagLevel;
          if (tl < level) {
            break;
          }
          if (tl == level) {
            tb.IsContent = true;
          }
        }
        // Downwards: blocks n .. end (the largest block itself is already content).
        for (int i = n; i < textBlocks.Count; i++) {
          TextBlock tb = textBlocks[i];
          int tl = tb.TagLevel;
          if (tl < level) {
            break;
          }
          if (tl == level) {
            tb.IsContent = true;
          }
        }
      }

      return true;
    }
    /// <summary>
    ///     Splits every block whose text contains line breaks into one block per piece
    ///     (split on runs of <c>\n</c> / <c>\r</c>). Each new block inherits the original
    ///     block's content flag and labels. Blocks without line breaks are kept as they are.
    /// </summary>
    /// <param name="doc">The document whose block list is rewritten in place.</param>
    /// <returns><c>true</c> if at least one block was split.</returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> original = doc.TextBlocks;
      var rebuilt = new List<TextBlock>();
      bool changes = false;

      foreach (TextBlock source in original) {
        string[] parts = Regex.Split(source.Text, "[\n\r]+");
        if (parts.Length >= 2) {
          bool contentFlag = source.IsContent;
          List<string> inherited = (source.Labels ?? Enumerable.Empty<string>()).ToList();
          foreach (string part in parts) {
            var piece = new TextBlock(part) { IsContent = contentFlag };
            piece.AddLabels(inherited);
            rebuilt.Add(piece);
          }
          changes = true;
        } else {
          rebuilt.Add(source);
        }
      }

      // Only touch the document's list when something was actually split.
      if (changes) {
        original.Clear();
        original.AddRange(rebuilt);
      }

      return changes;
    }
    /// <summary>
    ///     Walks the blocks from last to first. Blocks labelled <c>INDICATES_END_OF_TEXT</c> are
    ///     labelled <c>STRICTLY_NOT_CONTENT</c>, lose <c>MIGHT_BE_CONTENT</c> and are set to
    ///     non-content. The walk stops once the content blocks seen so far total more than
    ///     200 words.
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected.</param>
    /// <returns><c>true</c> if at least one block was changed.</returns>
    public bool Process(TextDocument doc) {
      bool changes = false;
      int words = 0;

      List<TextBlock> blocks = doc.TextBlocks;

      // Iterate backwards by index instead of calling List<T>.Reverse(), which reverses
      // the list in place and would leave the document's blocks in reversed order.
      for (int i = blocks.Count - 1; i >= 0; i--) {
        TextBlock tb = blocks[i];
        if (tb.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT)) {
          tb.AddLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
          tb.RemoveLabel(DefaultLabels.MIGHT_BE_CONTENT);
          tb.IsContent = false;
          changes = true;
        } else if (tb.IsContent) {
          words += tb.NumWords;
          if (words > 200) {
            break;
          }
        }
      }

      return changes;
    }
Example #8
0
    /// <summary>
    ///     Repeatedly merges a block into the current anchor block when the anchor is content,
    ///     the block's link density is below 0.56 and the block is not labelled
    ///     <c>STRICTLY_NOT_CONTENT</c>. Passes over the list are repeated until a pass merges
    ///     nothing.
    /// </summary>
    /// <param name="doc">The document whose block list is fused in place.</param>
    /// <returns>
    ///     <c>false</c> if the document has fewer than two blocks; otherwise always <c>true</c>.
    /// </returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> blocks = doc.TextBlocks;
      if (blocks.Count < 2) {
        return false;
      }

      // The anchor is deliberately NOT reset between passes; it carries over.
      TextBlock anchor = blocks[0];

      bool mergedInPass = true;
      while (mergedInPass) {
        mergedInPass = false;
        // Iterate a snapshot because blocks are removed from the live list.
        foreach (TextBlock candidate in blocks.Skip(1).ToList()) {
          bool fusable = anchor.IsContent
                         && candidate.LinkDensity < 0.56
                         && !candidate.HasLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
          if (!fusable) {
            anchor = candidate;
            continue;
          }
          anchor.MergeNext(candidate);
          blocks.Remove(candidate);
          mergedInPass = true;
        }
      }

      return true;
    }
    // public static long timeSpent = 0;

    /// <summary>
    ///     Labels short blocks (fewer than 15 words, at least 8 characters after trimming) with
    ///     <c>INDICATES_END_OF_TEXT</c> when their lower-cased text matches one of a fixed set of
    ///     phrases such as "comments", "post a comment" or "have your say".
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected.</param>
    /// <returns><c>true</c> if at least one block was labelled.</returns>
    public bool Process(TextDocument doc) {
      bool changes = false;

      foreach (TextBlock tb in doc.TextBlocks) {
        if (tb.NumWords >= 15) {
          continue;
        }
        string text = tb.Text.Trim();
        int len = text.Length;
        if (len < 8) {
          continue;
        }

        // The phrases are fixed lower-case literals, so casing and prefix checks must not
        // depend on the current culture (e.g. under tr-TR "I" lower-cases to a dotless i
        // and "READER VIEWS" would stop matching).
        string textLC = text.ToLowerInvariant();
        if (textLC.StartsWith("comments", System.StringComparison.Ordinal)
            || StartsWithNumber(textLC, len, " comments", " users responded in")
            || textLC.StartsWith("© reuters", System.StringComparison.Ordinal)
            || textLC.StartsWith("please rate this", System.StringComparison.Ordinal)
            || textLC.StartsWith("post a comment", System.StringComparison.Ordinal)
            || textLC.Contains("what you think...")
            || textLC.Contains("add your comment") || textLC.Contains("add comment")
            || textLC.Contains("reader views") || textLC.Contains("have your say")
            || textLC.Contains("reader comments") || textLC.Contains("rätta artikeln")
            || textLC.Equals("thanks for your comments - this feedback is now closed")) {
          tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
          changes = true;
        }
      }

      return changes;
    }
Example #10
0
 /// <summary>
 ///     Runs <c>Process</c> on the given <see cref="TextDocument" /> and returns the document's
 ///     content afterwards.
 /// </summary>
 /// <param name="doc">The <see cref="TextDocument" /> to process.</param>
 /// <returns>The value of <c>doc.Content</c> after processing.</returns>
 /// <exception cref="BoilerpipeProcessingException">
 ///     Wraps any exception raised while processing the document or reading its content;
 ///     the original exception is kept as the inner exception.
 /// </exception>
 public string GetText(TextDocument doc) {
   try {
     Process(doc);
     string extracted = doc.Content;
     return extracted;
   } catch (Exception failure) {
     throw new BoilerpipeProcessingException(failure.Message, failure);
   }
 }
    /// <summary>
    ///     Merges a content block into the preceding anchor block when the two are at most
    ///     <c>_maxBlocksDistance</c> blocks apart (measured via <c>OffsetBlocksStart</c> and
    ///     <c>OffsetBlocksEnd</c>). With <c>_contentOnly</c> both blocks must be content; with
    ///     <c>_sameTagLevelOnly</c> both must share the same <c>TagLevel</c>.
    /// </summary>
    /// <param name="doc">The document whose block list is fused in place.</param>
    /// <returns><c>true</c> if at least one pair of blocks was merged.</returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> blocks = doc.TextBlocks;
      if (blocks.Count < 2) {
        return false;
      }

      // Pick the initial anchor and the index from which candidates are scanned.
      TextBlock anchor;
      int startIndex;
      if (!_contentOnly) {
        anchor = blocks[0];
        startIndex = 1;
      } else {
        anchor = null;
        startIndex = 0;
        foreach (TextBlock candidate in blocks) {
          startIndex++;
          if (candidate.IsContent) {
            anchor = candidate;
            break;
          }
        }
        if (anchor == null) {
          return false;
        }
      }

      bool changes = false;
      // Iterate a snapshot because merged blocks are removed from the live list.
      foreach (TextBlock current in blocks.Skip(startIndex).ToList()) {
        if (!current.IsContent) {
          anchor = current;
          continue;
        }

        int gap = current.OffsetBlocksStart - anchor.OffsetBlocksEnd - 1;
        bool fuse = gap <= _maxBlocksDistance
                    && (!_contentOnly || (anchor.IsContent && current.IsContent))
                    && (!_sameTagLevelOnly || anchor.TagLevel == current.TagLevel);
        if (fuse) {
          anchor.MergeNext(current);
          blocks.Remove(current);
          changes = true;
        } else {
          anchor = current;
        }
      }

      return changes;
    }
    /// <summary>
    ///     Marks every non-content block as content if it carries at least one of the labels
    ///     in <c>_labels</c>.
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected.</param>
    /// <returns><c>true</c> if at least one block was switched to content.</returns>
    public bool Process(TextDocument doc) {
      bool changes = false;

      foreach (TextBlock block in doc.TextBlocks) {
        if (block.IsContent || !_labels.Any(block.HasLabel)) {
          continue;
        }
        block.IsContent = true;
        changes = true;
      }

      return changes;
    }
Example #13
0
    /// <summary>
    ///     Sets every content block with fewer than <c>_minWords</c> words to non-content.
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected.</param>
    /// <returns><c>true</c> if at least one block was demoted.</returns>
    public bool Process(TextDocument doc) {
      bool changes = false;

      foreach (TextBlock block in doc.TextBlocks) {
        if (!block.IsContent || block.NumWords >= _minWords) {
          continue;
        }
        block.IsContent = false;
        changes = true;
      }

      return changes;
    }
    /// <summary>
    ///     Removes every block that is not marked as content from the document's block list.
    /// </summary>
    /// <param name="doc">The document whose block list is pruned in place.</param>
    /// <returns><c>true</c> if at least one block was removed.</returns>
    public bool Process(TextDocument doc) {
      // RemoveAll prunes in a single pass. Copying the list and calling Remove per
      // element is quadratic and removes by equality rather than by position.
      return doc.TextBlocks.RemoveAll(block => !block.IsContent) > 0;
    }
 /// <summary>
 ///     Runs the filter pipeline below on the document, in order. Every stage always runs:
 ///     the results are combined with non-short-circuit OR.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any stage reported a change.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = TerminatingBlocksFinder.Instance.Process(doc);
   changed |= new DocumentTitleMatchClassifier(doc.Title).Process(doc);
   changed |= NumWordsRulesClassifier.Instance.Process(doc);
   changed |= IgnoreBlocksAfterContentFilter.DefaultInstance.Process(doc);
   changed |= BlockProximityFusion.MaxDistance1.Process(doc);
   changed |= BoilerplateBlockFilter.Instance.Process(doc);
   changed |= BlockProximityFusion.MaxDistance1ContentOnly.Process(doc);
   changed |= KeepLargestBlockFilter.Instance.Process(doc);
   changed |= ExpandTitleToContentFilter.Instance.Process(doc);
   return changed;
 }
Example #16
0
    /// <summary>
    ///     Inverts the content flag of every block: content becomes non-content and vice versa.
    /// </summary>
    /// <param name="doc">The document whose text blocks are flipped.</param>
    /// <returns><c>false</c> if the document has no blocks; otherwise <c>true</c>.</returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> blocks = doc.TextBlocks;
      if (blocks.Count == 0) {
        return false;
      }

      for (int i = 0; i < blocks.Count; i++) {
        TextBlock block = blocks[i];
        block.IsContent = !block.IsContent;
      }

      return true;
    }
 /// <summary>
 ///     Marks blocks of at most 10 words as content and labels them <c>ARTICLE_METADATA</c>
 ///     when their text matches any pattern in <c>PatternsShort</c>.
 /// </summary>
 /// <param name="doc">The document whose text blocks are inspected.</param>
 /// <returns><c>true</c> if at least one pattern matched a block.</returns>
 public bool Process(TextDocument doc) {
   bool changed = false;
   foreach (TextBlock block in doc.TextBlocks) {
     if (block.NumWords <= 10) {
       string blockText = block.Text;
       // Every pattern is tried; a block matching several patterns is marked repeatedly.
       foreach (Regex pattern in PatternsShort) {
         if (!pattern.IsMatch(blockText)) {
           continue;
         }
         changed = true;
         block.IsContent = true;
         block.AddLabel(DefaultLabels.ARTICLE_METADATA);
       }
     }
   }
   return changed;
 }
    /// <summary>
    ///     Labels a block with <c>TITLE</c> when its trimmed text equals one of the strings in
    ///     <c>_potentialTitles</c>.
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected.</param>
    /// <returns>
    ///     <c>false</c> if <c>_potentialTitles</c> is <c>null</c> or nothing matched;
    ///     otherwise <c>true</c>.
    /// </returns>
    public bool Process(TextDocument doc) {
      if (_potentialTitles == null) {
        return false;
      }

      bool changes = false;
      foreach (TextBlock block in doc.TextBlocks) {
        string trimmed = block.Text.Trim();
        foreach (string candidate in _potentialTitles) {
          if (!candidate.Equals(trimmed)) {
            continue;
          }
          block.AddLabel(DefaultLabels.TITLE);
          changes = true;
        }
      }

      return changes;
    }
    /// <summary>
    ///     Sets every block to non-content from the first block labelled
    ///     <c>INDICATES_END_OF_TEXT</c> that is reached after at least <c>_minNumWords</c>
    ///     full-text words of content have been accumulated (that block included).
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected.</param>
    /// <returns><c>true</c> if such an end-of-text block was found.</returns>
    public bool Process(TextDocument doc) {
      bool changes = false;
      bool pastEnd = false;
      int contentWords = 0;

      foreach (TextBlock block in doc.TextBlocks) {
        bool marksEnd = block.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
        // The block's own words count before the threshold is tested.
        if (block.IsContent) {
          contentWords += GetNumFullTextWords(block);
        }
        pastEnd = pastEnd || (marksEnd && contentWords >= _minNumWords);
        if (pastEnd) {
          block.IsContent = false;
          changes = true;
        }
      }

      return changes;
    }
    /// <summary>
    ///     Sets a content block to non-content unless at least one of its delimiter-terminated
    ///     segments satisfies <c>IsClause</c>. Segments are cut at matches of
    ///     <c>_patClauseDelimiter</c>. The text after the last examined delimiter is only
    ///     considered when <c>_acceptClausesWithoutDelimiter</c> is set.
    /// </summary>
    /// <param name="doc">The document whose content blocks are inspected.</param>
    /// <returns><c>true</c> if at least one block was demoted.</returns>
    public bool Process(TextDocument doc) {
      bool changes = false;
      foreach (TextBlock tb in doc.TextBlocks) {
        if (!tb.IsContent) {
          continue;
        }
        string text = tb.Text;

        int start = 0;
        bool hasClause = false;
        for (Match m = _patClauseDelimiter.Match(text); m.Success; m = m.NextMatch()) {
          // The segment runs from the end of the previous delimiter up to and including
          // the first character of this match (clamped so it never runs past the text).
          int segmentEnd = m.Index + 1 <= text.Length ? m.Index + 1 : text.Length;
          hasClause = IsClause(text.Substring(start, segmentEnd - start));

          // Continue right after this delimiter. Advancing by m.Length alone would put
          // the cursor at the wrong position for any match that does not start at 'start'.
          start = m.Index + m.Length;

          if (hasClause) {
            break;
          }
        }

        // since clauses should *always end* with a delimiter, we normally
        // don't consider text without one
        if (_acceptClausesWithoutDelimiter) {
          hasClause |= IsClause(text.Substring(start, text.Length - start));
        }

        if (!hasClause) {
          tb.IsContent = false;
          changes = true;
        }
      }

      return changes;
    }
Example #21
0
    /// <summary>
    ///     Merges each block into the preceding anchor block when <c>EqualLabels</c> reports
    ///     that the two have the same labels. Otherwise the block becomes the new anchor.
    /// </summary>
    /// <param name="doc">The document whose block list is fused in place.</param>
    /// <returns><c>true</c> if at least one pair of blocks was merged.</returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> blocks = doc.TextBlocks;
      if (blocks.Count < 2) {
        return false;
      }

      bool changes = false;
      TextBlock anchor = blocks[0];

      // Iterate a snapshot because merged blocks are removed from the live list.
      foreach (TextBlock candidate in blocks.Skip(1).ToList()) {
        if (!EqualLabels(anchor.Labels, candidate.Labels)) {
          anchor = candidate;
          continue;
        }
        anchor.MergeNext(candidate);
        blocks.Remove(candidate);
        changes = true;
      }

      return changes;
    }
    /// <summary>
    ///     Merges each block into the preceding anchor block when both have exactly the same
    ///     <c>TextDensity</c>. Otherwise the block becomes the new anchor.
    /// </summary>
    /// <param name="doc">The document whose block list is fused in place.</param>
    /// <returns><c>true</c> if at least one pair of blocks was merged.</returns>
    public bool Process(TextDocument doc) {
      List<TextBlock> blocks = doc.TextBlocks;
      if (blocks.Count < 2) {
        return false;
      }

      bool changes = false;
      TextBlock anchor = blocks[0];

      // Work on a snapshot of the tail because merged blocks leave the live list.
      List<TextBlock> tail = blocks.Skip(1).ToList();
      for (int i = 0; i < tail.Count; i++) {
        TextBlock candidate = tail[i];
        if (anchor.TextDensity == candidate.TextDensity) {
          anchor.MergeNext(candidate);
          blocks.Remove(candidate);
          changes = true;
        } else {
          anchor = candidate;
        }
      }

      return changes;
    }
Example #23
0
 /// <summary>
 ///     Processes the given document <c>doc</c>.
 /// </summary>
 /// <param name="doc">The <see cref="TextDocument" /> that is to be processed.</param>
 /// <returns><c>true</c> if changes have been made to the <see cref="TextDocument" />.</returns>
 /// <exception cref="BoilerpipeProcessingException">
 ///     Declared so implementations can signal a processing failure; the exact conditions
 ///     depend on the implementation.
 /// </exception>
 public abstract bool Process(TextDocument doc);
 /// <summary>
 ///     Runs simple block fusion, marks everything as content and then applies
 ///     <c>_filter</c>. Every stage always runs: the results are combined with
 ///     non-short-circuit OR.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any stage reported a change.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = SimpleBlockFusionProcessor.Instance.Process(doc);
   changed |= MarkEverythingContentFilter.Instance.Process(doc);
   changed |= _filter.Process(doc);
   return changed;
 }
 /// <summary>
 ///     Runs the article extractor, then splits paragraph blocks, then applies the
 ///     minimum-clause-words filter. Every stage always runs: the results are combined with
 ///     non-short-circuit OR.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any stage reported a change.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = ArticleExtractor.Instance.Process(doc);
   changed |= SplitParagraphBlocksFilter.Instance.Process(doc);
   changed |= MinClauseWordsFilter.Instance.Process(doc);
   return changed;
 }
 /// <summary>
 ///     Delegates processing of the document to <c>Classifier</c>.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns>The result reported by <c>Classifier.Process</c>.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = Classifier.Process(doc);
   return changed;
 }
 /// <summary>
 ///     Runs the word-count rules classifier, then block proximity fusion (max distance 1),
 ///     then keeps only the largest block. Every stage always runs: the results are combined
 ///     with non-short-circuit OR.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any stage reported a change.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = NumWordsRulesClassifier.Instance.Process(doc);
   changed |= BlockProximityFusion.MaxDistance1.Process(doc);
   changed |= KeepLargestBlockFilter.Instance.Process(doc);
   return changed;
 }
 /// <summary>
 ///     Runs simple block fusion, then block proximity fusion (max distance 1), then the
 ///     density rules classifier. Every stage always runs: the results are combined with
 ///     non-short-circuit OR.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any stage reported a change.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = SimpleBlockFusionProcessor.Instance.Process(doc);
   changed |= BlockProximityFusion.MaxDistance1.Process(doc);
   changed |= DensityRulesClassifier.Instance.Process(doc);
   return changed;
 }
 /// <summary>
 ///     Delegates to <c>MarkEverythingContentFilter.Instance</c>.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns>The result reported by the filter.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = MarkEverythingContentFilter.Instance.Process(doc);
   return changed;
 }
 /// <summary>
 ///     Delegates to <c>NumWordsRulesClassifier.Instance</c>.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns>The result reported by the classifier.</returns>
 public override bool Process(TextDocument doc) {
   bool changed = NumWordsRulesClassifier.Instance.Process(doc);
   return changed;
 }