/// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Promotes a non-content block to content when the block before it and the block
 /// after it are both content and <c>cond</c> accepts the block in the middle.
 /// </summary>
 /// <remarks>
 /// The three-block window moves forward two positions per iteration, so only the
 /// blocks at odd indices (1, 3, 5, ...) are ever tested as the middle block.
 /// </remarks>
 /// <param name="doc">The document whose text blocks are examined.</param>
 /// <returns><c>true</c> if at least one block was switched to content.</returns>
 public virtual bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.Count < 3)
     {
         // A middle block needs a neighbour on both sides.
         return false;
     }
     bool changed = false;
     TextBlock before = blocks[0];
     TextBlock middle = blocks[1];
     ListIterator<TextBlock> cursor = blocks.ListIterator(2);
     while (cursor.HasNext())
     {
         TextBlock after = cursor.Next();
         bool promote = !middle.IsContent() && before.IsContent() && after.IsContent()
             && cond.MeetsCondition(middle);
         if (promote)
         {
             middle.SetIsContent(true);
             changed = true;
         }
         // The block just read becomes the left neighbour of the next window.
         before = after;
         if (cursor.HasNext())
         {
             middle = cursor.Next();
         }
     }
     return changed;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Splits every text block whose text spans several lines into one block per
 /// paragraph. Each new block copies the content flag and labels of its source block.
 /// </summary>
 /// <param name="doc">The document whose block list is rewritten in place.</param>
 /// <returns><c>true</c> if at least one block was split.</returns>
 public bool Process(TextDocument doc)
 {
     bool changes = false;
     IList<TextBlock> blocks = doc.GetTextBlocks();
     IList<TextBlock> blocksNew = new AList<TextBlock>();
     foreach (TextBlock tb in blocks)
     {
         string[] paragraphs = SplitParagraphs(tb.GetText());
         if (paragraphs.Length < 2)
         {
             // Single paragraph: keep the original block untouched.
             blocksNew.AddItem(tb);
             continue;
         }
         bool isContent = tb.IsContent();
         ICollection<string> labels = tb.GetLabels();
         foreach (string p in paragraphs)
         {
             TextBlock tbP = new TextBlock(p);
             tbP.SetIsContent(isContent);
             tbP.AddLabels(labels);
             blocksNew.AddItem(tbP);
         }
         changes = true;
     }
     if (changes)
     {
         // Replace the contents of the document's list with the rebuilt sequence.
         blocks.Clear();
         Sharpen.Collections.AddAll(blocks, blocksNew);
     }
     return changes;
 }

 /// <summary>
 /// Splits <paramref name="text"/> on runs of line-feed / carriage-return characters.
 /// </summary>
 /// <remarks>
 /// The regular expression is applied explicitly through <c>Regex.Split</c>. Writing
 /// <c>text.Split("[\n\r]+")</c> can bind to the framework's literal-separator overload
 /// of <c>string.Split</c>, which would never match. Trailing empty entries are dropped
 /// so the result matches Java's <c>String.split(regex)</c>, which this code was ported from.
 /// </remarks>
 /// <param name="text">The block text to split.</param>
 /// <returns>The paragraphs in order, without trailing empty strings.</returns>
 private static string[] SplitParagraphs(string text)
 {
     string[] parts = System.Text.RegularExpressions.Regex.Split(text, "[\n\r]+");
     int length = parts.Length;
     while (length > 0 && parts[length - 1].Length == 0)
     {
         length--;
     }
     if (length == parts.Length)
     {
         return parts;
     }
     string[] trimmed = new string[length];
     System.Array.Copy(parts, trimmed, length);
     return trimmed;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Merges each block into the previously retained block when both report exactly
 /// the same text density; merged blocks are removed from the list.
 /// </summary>
 /// <param name="doc">The document whose block list is fused in place.</param>
 /// <returns><c>true</c> if at least one block was merged.</returns>
 public virtual bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.Count < 2)
     {
         return false;
     }
     bool merged = false;
     TextBlock anchor = blocks[0];
     ListIterator<TextBlock> cursor = blocks.ListIterator(1);
     while (cursor.HasNext())
     {
         TextBlock candidate = cursor.Next();
         if (anchor.GetTextDensity() != candidate.GetTextDensity())
         {
             // Different density: the candidate starts a new run.
             anchor = candidate;
             continue;
         }
         anchor.MergeNext(candidate);
         cursor.Remove();
         merged = true;
     }
     return merged;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Marks as non-content every block from the first end-of-text indicator onwards,
 /// once at least <c>minNumWords</c> full-text words of content have been counted.
 /// </summary>
 /// <param name="doc">The document whose blocks are scanned in order.</param>
 /// <returns><c>true</c> if an end-of-text block was reached and blocks were reset.</returns>
 public bool Process(TextDocument doc)
 {
     bool changes = false;
     bool pastEndOfText = false;
     int contentWords = 0;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         bool marksEnd = block.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
         if (block.IsContent())
         {
             contentWords += GetNumFullTextWords(block);
         }
         if (marksEnd && contentWords >= minNumWords)
         {
             pastEndOfText = true;
         }
         if (!pastEndOfText)
         {
             continue;
         }
         // The indicator block itself and everything after it is dropped.
         changes = true;
         block.SetIsContent(false);
     }
     return changes;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Marks blocks labelled <c>MIGHT_BE_CONTENT</c> as content when they lie between the
 /// last title block seen before the first content block and that first content block.
 /// </summary>
 /// <param name="doc">The document whose blocks are examined.</param>
 /// <returns>
 /// <c>true</c> if any <c>SetIsContent(true)</c> call returned <c>true</c>;
 /// <c>false</c> if no title was found or the first content block is not after it.
 /// </returns>
 public bool Process(TextDocument doc)
 {
     int titleIndex = -1;
     int firstContentIndex = -1;
     int position = 0;
     foreach (TextBlock candidate in doc.GetTextBlocks())
     {
         // Once the first content block is known, nothing else is recorded.
         if (firstContentIndex == -1)
         {
             if (candidate.HasLabel(DefaultLabels.TITLE))
             {
                 titleIndex = position;
             }
             if (candidate.IsContent())
             {
                 firstContentIndex = position;
             }
         }
         position++;
     }
     if (titleIndex == -1 || firstContentIndex <= titleIndex)
     {
         return false;
     }
     bool changes = false;
     foreach (TextBlock between in doc.GetTextBlocks().SubList(titleIndex, firstContentIndex))
     {
         if (!between.HasLabel(DefaultLabels.MIGHT_BE_CONTENT))
         {
             continue;
         }
         if (between.SetIsContent(true))
         {
             changes = true;
         }
     }
     return changes;
 }
 // public static long timeSpent = 0;
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Labels short blocks (fewer than 15 words, at least 8 characters after trimming)
 /// with <c>INDICATES_END_OF_TEXT</c> when their lower-cased text matches one of the
 /// known "end of article" phrases.
 /// </summary>
 /// <remarks>
 /// Matching is done on an invariant-culture lower-cased copy with ordinal prefix
 /// comparisons, so the result does not depend on the current thread culture.
 /// </remarks>
 /// <param name="doc">The document whose blocks are labelled.</param>
 /// <returns><c>true</c> if at least one block received the label.</returns>
 public virtual bool Process(TextDocument doc)
 {
     bool changes = false;
     foreach (TextBlock tb in doc.GetTextBlocks())
     {
         if (tb.GetNumWords() >= 15)
         {
             continue;
         }
         string text = tb.GetText().Trim();
         int len = text.Length;
         if (len < 8)
         {
             continue;
         }
         string textLC = text.ToLowerInvariant();
         bool indicatesEnd =
             textLC.StartsWith("comments", System.StringComparison.Ordinal)
             || StartsWithNumber(textLC, len, " comments", " users responded in")
             || textLC.StartsWith("© reuters", System.StringComparison.Ordinal)
             || textLC.StartsWith("please rate this", System.StringComparison.Ordinal)
             || textLC.StartsWith("post a comment", System.StringComparison.Ordinal)
             || textLC.Contains("what you think...")
             || textLC.Contains("add your comment")
             || textLC.Contains("add comment")
             || textLC.Contains("reader views")
             || textLC.Contains("have your say")
             || textLC.Contains("reader comments")
             || textLC.Contains("rätta artikeln")
             || textLC.Equals("thanks for your comments - this feedback is now closed");
         if (indicatesEnd)
         {
             tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
             changes = true;
         }
     }
     return changes;
 }
Esempio n. 7
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Runs <c>Classify</c> on every block together with its predecessor and successor.
 /// <c>TextBlock.EMPTY_START</c> is passed in place of a missing neighbour at either end.
 /// </summary>
 /// <param name="doc">The document whose blocks are classified.</param>
 /// <returns><c>true</c> if any <c>Classify</c> call returned <c>true</c>.</returns>
 public bool Process(TextDocument doc)
 {
     ListIterator<TextBlock> cursor = doc.GetTextBlocks().ListIterator();
     if (!cursor.HasNext())
     {
         return false;
     }
     TextBlock previous = TextBlock.EMPTY_START;
     TextBlock current = cursor.Next();
     TextBlock upcoming = TextBlock.EMPTY_START;
     if (cursor.HasNext())
     {
         upcoming = cursor.Next();
     }
     bool hasChanges = Classify(previous, current, upcoming);
     if (upcoming == TextBlock.EMPTY_START)
     {
         // Only one block in the document: it has already been classified.
         return hasChanges;
     }
     while (cursor.HasNext())
     {
         previous = current;
         current = upcoming;
         upcoming = cursor.Next();
         hasChanges |= Classify(previous, current, upcoming);
     }
     // The last block has no successor.
     hasChanges |= Classify(current, upcoming, TextBlock.EMPTY_START);
     return hasChanges;
 }
Esempio n. 8
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Repeatedly merges a block into the preceding block when the preceding block is
 /// content, the block's link density is below 0.56 and it is not labelled
 /// <c>STRICTLY_NOT_CONTENT</c>. Passes are repeated until one makes no merge.
 /// </summary>
 /// <remarks>
 /// Each pass starts again from the first block of the list. Carrying the anchor over
 /// from the previous pass would let a block near the end absorb the block at index 1.
 /// </remarks>
 /// <param name="doc">The document whose block list is fused in place.</param>
 /// <returns><c>true</c> if at least one block was merged in any pass.</returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> textBlocks = doc.GetTextBlocks();
     if (textBlocks.Count < 2)
     {
         return false;
     }
     bool anyChanges = false;
     bool changes;
     do
     {
         changes = false;
         // Restart from the head of the (possibly shortened) list on every pass.
         TextBlock prevBlock = textBlocks[0];
         for (ListIterator<TextBlock> it = textBlocks.ListIterator(1); it.HasNext(); )
         {
             TextBlock block = it.Next();
             if (prevBlock.IsContent() && block.GetLinkDensity() < 0.56 && !block.HasLabel(DefaultLabels
                 .STRICTLY_NOT_CONTENT))
             {
                 prevBlock.MergeNext(block);
                 it.Remove();
                 changes = true;
             }
             else
             {
                 prevBlock = block;
             }
         }
         anyChanges |= changes;
     }
     while (changes);
     return anyChanges;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Walks the blocks from the end of the document backwards and demotes every block
 /// labelled <c>INDICATES_END_OF_TEXT</c>. The walk stops once more than 200 words of
 /// content have been passed.
 /// </summary>
 /// <param name="doc">The document whose trailing blocks are examined.</param>
 /// <returns><c>true</c> if at least one end-of-text block was demoted.</returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.IsEmpty())
     {
         return false;
     }
     bool changes = false;
     int contentWords = 0;
     ListIterator<TextBlock> cursor = blocks.ListIterator<TextBlock>(blocks.Count);
     while (cursor.HasPrevious())
     {
         TextBlock tb = cursor.Previous();
         if (tb.HasLabel(DefaultLabels.INDICATES_END_OF_TEXT))
         {
             tb.AddLabel(DefaultLabels.STRICTLY_NOT_CONTENT);
             tb.RemoveLabel(DefaultLabels.MIGHT_BE_CONTENT);
             tb.SetIsContent(false);
             changes = true;
             continue;
         }
         if (!tb.IsContent())
         {
             continue;
         }
         contentWords += tb.GetNumWords();
         if (contentWords > 200)
         {
             break;
         }
     }
     return changes;
 }
Esempio n. 10
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Merges each block into the previously retained block when <c>EqualLabels</c>
 /// reports that both carry the same labels; merged blocks are removed from the list.
 /// </summary>
 /// <param name="doc">The document whose block list is fused in place.</param>
 /// <returns><c>true</c> if at least one block was merged.</returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.Count < 2)
     {
         return false;
     }
     bool merged = false;
     TextBlock anchor = blocks[0];
     ListIterator<TextBlock> cursor = blocks.ListIterator(1);
     while (cursor.HasNext())
     {
         TextBlock candidate = cursor.Next();
         if (!EqualLabels(anchor.GetLabels(), candidate.GetLabels()))
         {
             anchor = candidate;
             continue;
         }
         anchor.MergeNext(candidate);
         cursor.Remove();
         merged = true;
     }
     return merged;
 }
Esempio n. 11
0
 /// <summary>
 /// Computes statistics on a given
 /// <see cref="TextDocument">TextDocument</see>
 /// .
 /// </summary>
 /// <param name="doc">
 /// The
 /// <see cref="TextDocument">TextDocument</see>
 /// .
 /// </param>
 /// <param name="contentOnly">If true, only blocks marked as content are counted; otherwise every block is counted.</param>
 public TextDocumentStatistics(TextDocument doc, bool contentOnly)
 {
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         // With contentOnly set, blocks that are not content are left out of both totals.
         bool counted = !contentOnly || block.IsContent();
         if (counted)
         {
             numWords += block.GetNumWords();
             numBlocks++;
         }
     }
 }
Esempio n. 12
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Flips the content flag of every block in the document.
 /// </summary>
 /// <param name="doc">The document whose blocks are inverted.</param>
 /// <returns><c>false</c> if the document has no blocks; otherwise <c>true</c>.</returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     bool hasBlocks = !blocks.IsEmpty();
     if (hasBlocks)
     {
         foreach (TextBlock block in blocks)
         {
             bool wasContent = block.IsContent();
             block.SetIsContent(!wasContent);
         }
     }
     return hasBlocks;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Marks every block that is not yet content as content.
 /// </summary>
 /// <param name="doc">The document whose blocks are updated.</param>
 /// <returns><c>true</c> if at least one block was switched to content.</returns>
 public bool Process(TextDocument doc)
 {
     bool changes = false;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         if (block.IsContent())
         {
             continue;
         }
         block.SetIsContent(true);
         changes = true;
     }
     return changes;
 }
Esempio n. 14
0
        /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
        /// <summary>
        /// Runs the full chain of processing steps on the document, in the order listed.
        /// Every step is executed regardless of what the earlier steps returned.
        /// </summary>
        /// <param name="doc">The document to process.</param>
        /// <returns><c>true</c> if any step reported a change.</returns>
        public override bool Process(TextDocument doc)
        {
            bool changed = TerminatingBlocksFinder.INSTANCE.Process(doc);
            changed |= new DocumentTitleMatchClassifier(doc.GetTitle()).Process(doc);
            changed |= NumWordsRulesClassifier.INSTANCE.Process(doc);
            changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.Process(doc);
            changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
            changed |= BoilerplateBlockFilter.INSTANCE.Process(doc);
            changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY.Process(doc);
            changed |= KeepLargestBlockFilter.INSTANCE.Process(doc);
            changed |= ExpandTitleToContentFilter.INSTANCE.Process(doc);
            return changed;
        }
Esempio n. 15
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Removes every block that is not marked as content from the document's block list.
 /// </summary>
 /// <param name="doc">The document whose block list is filtered in place.</param>
 /// <returns><c>true</c> if at least one block was removed.</returns>
 public bool Process(TextDocument doc)
 {
     bool removedAny = false;
     Iterator<TextBlock> cursor = doc.GetTextBlocks().Iterator();
     while (cursor.HasNext())
     {
         TextBlock block = cursor.Next();
         if (block.IsContent())
         {
             continue;
         }
         cursor.Remove();
         removedAny = true;
     }
     return removedAny;
 }
Esempio n. 16
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Demotes content blocks that have fewer than <c>minWords</c> words.
 /// </summary>
 /// <param name="doc">The document whose content blocks are checked.</param>
 /// <returns><c>true</c> if at least one block was demoted.</returns>
 public bool Process(TextDocument doc)
 {
     bool changes = false;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         bool tooShort = block.IsContent() && block.GetNumWords() < minWords;
         if (tooShort)
         {
             block.SetIsContent(false);
             changes = true;
         }
     }
     return changes;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Demotes every content block that carries at least one of the configured <c>labels</c>.
 /// </summary>
 /// <param name="doc">The document whose content blocks are checked.</param>
 /// <returns><c>true</c> if at least one block was demoted.</returns>
 public bool Process(TextDocument doc)
 {
     bool changes = false;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         if (!block.IsContent())
         {
             continue;
         }
         foreach (string label in labels)
         {
             if (!block.HasLabel(label))
             {
                 continue;
             }
             block.SetIsContent(false);
             changes = true;
             // One matching label is enough; move on to the next block.
             break;
         }
     }
     return changes;
 }
Esempio n. 18
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Demotes content blocks in which no segment ending at a clause delimiter is accepted
 /// by <c>IsClause</c>. When <c>acceptClausesWithoutDelimiter</c> is set, the text after
 /// the last inspected delimiter is tested as well.
 /// </summary>
 /// <param name="doc">The document whose content blocks are checked.</param>
 /// <returns><c>true</c> if at least one block was demoted.</returns>
 public bool Process(TextDocument doc)
 {
     bool changes = false;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         if (!block.IsContent())
         {
             continue;
         }
         string text = block.GetText();
         Matcher delimiters = PAT_CLAUSE_DELIMITER.Matcher(text);
         int clauseStart = 0;
         bool hasClause = false;
         // Stop searching as soon as one segment qualifies as a clause.
         while (!hasClause && delimiters.Find())
         {
             int clauseEnd = delimiters.Start() + 1;
             hasClause = IsClause(text.SubSequence(clauseStart, clauseEnd));
             clauseStart = delimiters.End();
         }
         // Clauses should always end with a delimiter, so the remaining text is
         // only considered when explicitly allowed.
         if (acceptClausesWithoutDelimiter)
         {
             hasClause |= IsClause(text.SubSequence(clauseStart, text.Length));
         }
         if (hasClause)
         {
             continue;
         }
         block.SetIsContent(false);
         changes = true;
     }
     return changes;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Keeps only the content block with the highest <c>GetNumFullTextWords</c> value as
 /// content. Every other block is demoted and labelled <c>MIGHT_BE_CONTENT</c>.
 /// </summary>
 /// <param name="doc">The document whose blocks are filtered.</param>
 /// <returns>
 /// <c>false</c> if the document has fewer than two blocks or no content block;
 /// otherwise <c>true</c>.
 /// </returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.Count < 2)
     {
         return false;
     }
     TextBlock winner = null;
     int winnerWords = -1;
     foreach (TextBlock candidate in blocks)
     {
         if (candidate.IsContent())
         {
             int words = GetNumFullTextWords(candidate);
             // Strict comparison: on a tie the earlier block stays the winner.
             if (words > winnerWords)
             {
                 winner = candidate;
                 winnerWords = words;
             }
         }
     }
     if (winner == null)
     {
         return false;
     }
     foreach (TextBlock block in blocks)
     {
         if (block == winner)
         {
             block.SetIsContent(true);
             continue;
         }
         block.SetIsContent(false);
         block.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
     }
     return true;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Marks blocks of at most 10 words as content and labels them <c>ARTICLE_METADATA</c>
 /// when their text matches any pattern in <c>PATTERNS_SHORT</c>.
 /// </summary>
 /// <param name="doc">The document whose short blocks are checked.</param>
 /// <returns><c>true</c> if at least one pattern matched a block.</returns>
 public virtual bool Process(TextDocument doc)
 {
     bool changed = false;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         if (block.GetNumWords() <= 10)
         {
             string text = block.GetText();
             // Every pattern is tried, even after an earlier one has matched.
             foreach (Sharpen.Pattern pattern in PATTERNS_SHORT)
             {
                 if (!pattern.Matcher(text).Find())
                 {
                     continue;
                 }
                 changed = true;
                 block.SetIsContent(true);
                 block.AddLabel(DefaultLabels.ARTICLE_METADATA);
             }
         }
     }
     return changed;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Walks the blocks from the end backwards and copies the labels of each visited block,
 /// prefixed with <c>labelPrefix</c>, onto the block that follows it in the document.
 /// </summary>
 /// <remarks>
 /// The countdown stops the walk before the first block of the list is read, so the
 /// first block's labels are not propagated.
 /// </remarks>
 /// <param name="doc">The document whose blocks receive the additional labels.</param>
 /// <returns><c>true</c> if at least one visited block had labels to propagate.</returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.Count < 2)
     {
         return false;
     }
     bool changes = false;
     int remaining = blocks.Count;
     TextBlock below = null;
     ListIterator<TextBlock> cursor = blocks.ListIterator<TextBlock>(blocks.Count);
     while (cursor.HasPrevious())
     {
         remaining--;
         if (remaining <= 0)
         {
             break;
         }
         TextBlock current = cursor.Previous();
         if (below == null)
         {
             // The last block only receives labels; it has no successor to label.
             below = current;
             continue;
         }
         ICollection<string> currentLabels = current.GetLabels();
         if (currentLabels != null && !currentLabels.IsEmpty())
         {
             foreach (string label in currentLabels)
             {
                 below.AddLabel(labelPrefix + label);
             }
             changes = true;
         }
         below = current;
     }
     return changes;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Runs the word-count classifier, the distance-1 block fusion and the largest-block
 /// filter, in that order. Every step is executed regardless of earlier results.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any step reported a change.</returns>
 public override bool Process(TextDocument doc)
 {
     bool changed = NumWordsRulesClassifier.INSTANCE.Process(doc);
     changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
     changed |= KeepLargestBlockFilter.INSTANCE.Process(doc);
     return changed;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Labels a block as <c>TITLE</c> when its trimmed text equals one of the
 /// <c>potentialTitles</c>.
 /// </summary>
 /// <param name="doc">The document whose blocks are compared against the candidates.</param>
 /// <returns>
 /// <c>true</c> if at least one block matched; <c>false</c> if there are no candidates
 /// or nothing matched.
 /// </returns>
 public bool Process(TextDocument doc)
 {
     if (potentialTitles == null)
     {
         return false;
     }
     bool changes = false;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         string trimmed = block.GetText().Trim();
         foreach (string candidate in potentialTitles)
         {
             if (!candidate.Equals(trimmed))
             {
                 continue;
             }
             block.AddLabel(DefaultLabels.TITLE);
             changes = true;
         }
     }
     return changes;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Keeps only the content block with the most words as content; every other block is
 /// demoted and labelled <c>MIGHT_BE_CONTENT</c>. When <c>expandToSameLevelText</c> is
 /// set, blocks around the largest one that share its tag level are marked as content
 /// again, scanning in each direction until a block with a lower tag level is found.
 /// </summary>
 /// <param name="doc">The document whose blocks are filtered.</param>
 /// <returns><c>false</c> if the document has fewer than two blocks; otherwise <c>true</c>.</returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.Count < 2)
     {
         return false;
     }
     TextBlock largest = null;
     int largestWords = -1;
     int largestIndex = -1;
     int largestLevel = -1;
     int position = -1;
     foreach (TextBlock candidate in blocks)
     {
         position++;
         if (!candidate.IsContent())
         {
             continue;
         }
         int words = candidate.GetNumWords();
         if (words <= largestWords)
         {
             continue;
         }
         largest = candidate;
         largestWords = words;
         largestIndex = position;
         if (expandToSameLevelText)
         {
             largestLevel = candidate.GetTagLevel();
         }
     }
     foreach (TextBlock block in blocks)
     {
         if (block == largest)
         {
             block.SetIsContent(true);
             continue;
         }
         block.SetIsContent(false);
         block.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
     }
     if (!expandToSameLevelText || largestIndex == -1)
     {
         return true;
     }
     // Scan the blocks before the largest one.
     ListIterator<TextBlock> backward = blocks.ListIterator(largestIndex);
     while (backward.HasPrevious())
     {
         TextBlock earlier = backward.Previous();
         int tagLevel = earlier.GetTagLevel();
         if (tagLevel < largestLevel)
         {
             break;
         }
         if (tagLevel == largestLevel)
         {
             earlier.SetIsContent(true);
         }
     }
     // Scan from the largest block itself to the end.
     ListIterator<TextBlock> forward = blocks.ListIterator(largestIndex);
     while (forward.HasNext())
     {
         TextBlock later = forward.Next();
         int tagLevel = later.GetTagLevel();
         if (tagLevel < largestLevel)
         {
             break;
         }
         if (tagLevel == largestLevel)
         {
             later.SetIsContent(true);
         }
     }
     return true;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Delegates to <c>MarkEverythingContentFilter.INSTANCE</c>.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns>The result reported by the filter.</returns>
 public override bool Process(TextDocument doc)
 {
     bool changed = MarkEverythingContentFilter.INSTANCE.Process(doc);
     return changed;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Runs the article extractor, the paragraph splitter and the minimum-clause-words
 /// filter, in that order. Every step is executed regardless of earlier results.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any step reported a change.</returns>
 public override bool Process(TextDocument doc)
 {
     bool changed = ArticleExtractor.INSTANCE.Process(doc);
     changed |= SplitParagraphBlocksFilter.INSTANCE.Process(doc);
     changed |= MinClauseWordsFilter.INSTANCE.Process(doc);
     return changed;
 }
Esempio n. 27
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Runs the simple block fusion, the distance-1 block fusion and the density rules
 /// classifier, in that order. Every step is executed regardless of earlier results.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns><c>true</c> if any step reported a change.</returns>
 public override bool Process(TextDocument doc)
 {
     bool changed = SimpleBlockFusionProcessor.INSTANCE.Process(doc);
     changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
     changed |= DensityRulesClassifier.INSTANCE.Process(doc);
     return changed;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Merges a content block into the previously tracked block when the gap between them
 /// is at most <c>maxBlocksDistance</c>, subject to the <c>contentOnly</c> and
 /// <c>sameTagLevelOnly</c> restrictions. Merged blocks are removed from the list.
 /// </summary>
 /// <param name="doc">The document whose block list is fused in place.</param>
 /// <returns><c>true</c> if at least one block was merged.</returns>
 public bool Process(TextDocument doc)
 {
     IList<TextBlock> blocks = doc.GetTextBlocks();
     if (blocks.Count < 2)
     {
         return false;
     }
     TextBlock anchor = null;
     int offset = 0;
     if (!contentOnly)
     {
         anchor = blocks[0];
         offset = 1;
     }
     else
     {
         // Start right after the first content block; give up if there is none.
         foreach (TextBlock candidate in blocks)
         {
             offset++;
             if (candidate.IsContent())
             {
                 anchor = candidate;
                 break;
             }
         }
         if (anchor == null)
         {
             return false;
         }
     }
     bool changes = false;
     ListIterator<TextBlock> cursor = blocks.ListIterator<TextBlock>(offset);
     while (cursor.HasNext())
     {
         TextBlock block = cursor.Next();
         if (!block.IsContent())
         {
             // Non-content blocks are never merged but still become the new anchor.
             anchor = block;
             continue;
         }
         int gap = block.GetOffsetBlocksStart() - anchor.GetOffsetBlocksEnd() - 1;
         bool fuse = gap <= maxBlocksDistance;
         if (fuse && contentOnly && (!anchor.IsContent() || !block.IsContent()))
         {
             fuse = false;
         }
         if (fuse && sameTagLevelOnly && anchor.GetTagLevel() != block.GetTagLevel())
         {
             fuse = false;
         }
         if (!fuse)
         {
             anchor = block;
             continue;
         }
         anchor.MergeNext(block);
         cursor.Remove();
         changes = true;
     }
     return changes;
 }
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Delegates to <c>NumWordsRulesClassifier.INSTANCE</c>.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns>The result reported by the classifier.</returns>
 public override bool Process(TextDocument doc)
 {
     bool changed = NumWordsRulesClassifier.INSTANCE.Process(doc);
     return changed;
 }
Esempio n. 30
0
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 /// <summary>
 /// Delegates to <c>CLASSIFIER</c>.
 /// </summary>
 /// <param name="doc">The document to process.</param>
 /// <returns>The result reported by <c>CLASSIFIER</c>.</returns>
 public override bool Process(TextDocument doc)
 {
     bool changed = CLASSIFIER.Process(doc);
     return changed;
 }