/// <summary>
		/// Classifies every text block of the document by handing it to <c>Classify</c>
		/// together with its predecessor and successor. <c>TextBlock.EMPTY_START</c>
		/// stands in for the missing neighbour at the start and at the end of the list.
		/// </summary>
		/// <param name="doc">The document whose text blocks are classified.</param>
		/// <returns>
		/// <c>true</c> if at least one <c>Classify</c> call returned <c>true</c>;
		/// <c>false</c> otherwise, including when the document has no text blocks.
		/// </returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public virtual bool Process(TextDocument doc)
		{
			IList<TextBlock> blocks = doc.GetTextBlocks();
			var cursor = blocks.GetEnumerator();
			if (!cursor.MoveNext())
			{
				return false;
			}

			TextBlock previous = TextBlock.EMPTY_START;
			TextBlock current = cursor.Current;
			TextBlock upcoming = cursor.MoveNext() ? cursor.Current : TextBlock.EMPTY_START;

			// "|=" on bool does not short-circuit, so Classify runs for every window.
			bool changed = Classify(previous, current, upcoming);

			if (upcoming != TextBlock.EMPTY_START)
			{
				// Slide the three-block window over the rest of the list.
				while (cursor.MoveNext())
				{
					previous = current;
					current = upcoming;
					upcoming = cursor.Current;
					changed |= Classify(previous, current, upcoming);
				}

				// The final block has no successor.
				changed |= Classify(current, upcoming, TextBlock.EMPTY_START);
			}
			return changed;
		}
		/// <summary>
		/// Splits every text block whose text contains line breaks into one block per
		/// paragraph. Each new block copies the content flag and the labels of the
		/// block it was split from. Blocks with fewer than two paragraphs are kept as is.
		/// </summary>
		/// <param name="doc">The document whose text block list is rewritten in place.</param>
		/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			// Paragraphs are delimited by runs of CR/LF characters. The previous
			// separator list ('[', '\n', '\r', ']', '+') was a regex character class
			// taken literally: it also split on brackets and plus signs and produced
			// empty paragraphs for "\r\n" and blank lines.
			char[] lineBreaks = new char[] { '\n', '\r' };

			bool changes = false;
			IList<TextBlock> blocks = doc.GetTextBlocks();
			List<TextBlock> blocksNew = new List<TextBlock>(blocks.Count);
			foreach (TextBlock tb in blocks)
			{
				string[] paragraphs = tb.GetText().Split(lineBreaks, System.StringSplitOptions.RemoveEmptyEntries);
				if (paragraphs.Length < 2)
				{
					blocksNew.Add(tb);
					continue;
				}

				bool isContent = tb.IsContent();
				ICollection<string> labels = tb.GetLabels();
				foreach (string p in paragraphs)
				{
					TextBlock tbP = new TextBlock(p);
					tbP.SetIsContent(isContent);
					tbP.AddLabels(labels);
					blocksNew.Add(tbP);
				}
				changes = true;
			}

			if (changes)
			{
				// Replace the contents of the document's own list rather than the list itself.
				blocks.Clear();
				foreach (TextBlock block in blocksNew)
				{
					blocks.Add(block);
				}
			}
			return changes;
		}
		/// <summary>
		/// Repeatedly merges a block into its predecessor when the predecessor is
		/// content, the block's link density is below 0.56 and the block is not
		/// labelled <c>DefaultLabels.STRICTLY_NOT_CONTENT</c>. Passes are repeated
		/// until a full pass performs no merge.
		/// </summary>
		/// <param name="doc">The document whose text block list is fused in place.</param>
		/// <returns>
		/// <c>false</c> if the document has fewer than two text blocks; otherwise
		/// always <c>true</c>, whether or not any blocks were merged.
		/// </returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			IList<TextBlock> textBlocks = doc.GetTextBlocks();
			if (textBlocks.Count < 2)
			{
				return false;
			}

			bool changes;
			do
			{
				// Must be reset for every pass: the flag used to stay true after the
				// first merge, which made the outer loop spin forever.
				changes = false;

				// Every pass starts again from the first block so that a block is
				// only ever merged into the block directly preceding it.
				TextBlock b1 = textBlocks[0];
				int i = 1;
				while (i < textBlocks.Count)
				{
					TextBlock b2 = textBlocks[i];
					if (b1.IsContent() && b2.GetLinkDensity() < 0.56
						&& !b2.HasLabel(DefaultLabels.STRICTLY_NOT_CONTENT))
					{
						b1.MergeNext(b2);
						// Removing by index needs neither a snapshot copy of the list
						// nor an equality search; the next candidate moves into slot i.
						textBlocks.RemoveAt(i);
						changes = true;
					}
					else
					{
						b1 = b2;
						i++;
					}
				}
			}
			while (changes);

			// Kept for compatibility: the result does not reflect whether a merge happened.
			return true;
		}
		/// <summary>
		/// Marks a non-content block as content when both its direct predecessor and
		/// its direct successor are content and <c>cond.MeetsCondition</c> accepts it.
		/// </summary>
		/// <param name="doc">The document whose text blocks are examined.</param>
		/// <returns>
		/// <c>true</c> if at least one block was switched to content; <c>false</c>
		/// otherwise, including when the document has fewer than three text blocks.
		/// </returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public virtual bool Process(TextDocument doc)
		{
			IList<TextBlock> tbs = doc.GetTextBlocks();
			if (tbs.Count < 3)
			{
				return false;
			}

			bool hasChanges = false;

			// Visit every interior block with its two neighbours. The previous
			// enumerator-based loop read the middle and the following block from the
			// same enumerator position after the first step, so only the first
			// triple could ever match.
			for (int i = 1; i < tbs.Count - 1; i++)
			{
				TextBlock a = tbs[i - 1];
				TextBlock b = tbs[i];
				TextBlock c = tbs[i + 1];
				if (!b.IsContent() && a.IsContent() && c.IsContent() && cond.MeetsCondition(b))
				{
					b.SetIsContent(true);
					hasChanges = true;
				}
			}
			return hasChanges;
		}
		/// <summary>
		/// Promotes list items to content when they follow a content block labelled
		/// <c>DefaultLabels.VERY_LIKELY_CONTENT</c>. A block qualifies when its tag level
		/// is greater than that of the last such content block, it carries both the
		/// <c>MIGHT_BE_CONTENT</c> and <c>LI</c> labels, and its link density is 0.
		/// Any other block resets the remembered tag level.
		/// </summary>
		/// <param name="doc">The document whose text blocks are examined.</param>
		/// <returns><c>true</c> if at least one block was switched to content.</returns>
		public bool Process(TextDocument doc)
		{
			bool changed = false;
			int anchorTagLevel = Int32.MaxValue;

			foreach (TextBlock block in doc.GetTextBlocks())
			{
				if (block.IsContent() && block.HasLabel(DefaultLabels.VERY_LIKELY_CONTENT))
				{
					anchorTagLevel = block.GetTagLevel();
					continue;
				}

				bool isFollowingListItem = block.GetTagLevel() > anchorTagLevel
					&& block.HasLabel(DefaultLabels.MIGHT_BE_CONTENT)
					&& block.HasLabel(DefaultLabels.LI)
					&& block.GetLinkDensity() == 0;

				if (!isFollowingListItem)
				{
					// Int32.MaxValue can never be exceeded, so nothing matches until
					// the next very-likely-content block is seen.
					anchorTagLevel = Int32.MaxValue;
					continue;
				}

				block.SetIsContent(true);
				changed = true;
			}

			return changed;
		}
        /// <summary>
        /// Marks non-content blocks with at least 100 words as content when they sit
        /// on the same tag level as the first content block labelled
        /// <c>DefaultLabels.VERY_LIKELY_CONTENT</c>.
        /// </summary>
        /// <param name="doc">The document whose text blocks are examined.</param>
        /// <returns>
        /// <c>true</c> if at least one block was switched to content; <c>false</c>
        /// otherwise, including when no very-likely-content block exists.
        /// </returns>
        public bool Process(TextDocument doc)
        {
            int tagLevel = -1;
            foreach (var tb in doc.GetTextBlocks())
            {
                if (tb.IsContent() && tb.HasLabel(DefaultLabels.VERY_LIKELY_CONTENT))
                {
                    tagLevel = tb.GetTagLevel();
                    break;
                }
            }

            // This check has to come after the whole list was searched. It used to
            // sit inside the loop, which aborted on the first block that did not match.
            if (tagLevel == -1)
            {
                return false;
            }

            var changes = false;
            foreach (var tb in doc.GetTextBlocks())
            {
                if (!tb.IsContent() && tb.GetNumWords() >= 100 && tb.GetTagLevel() == tagLevel)
                {
                    tb.SetIsContent(true);
                    changes = true;
                }
            }
            return changes;
        }
		/// <summary>
		/// Removes all non-content text blocks from the document. When
		/// <c>labelToKeep</c> is set, non-content blocks carrying that label are kept.
		/// </summary>
		/// <param name="doc">The document whose text block list is filtered in place.</param>
		/// <returns><c>true</c> if at least one block was removed; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			IList<TextBlock> textBlocks = doc.GetTextBlocks();
			int removed = 0;

			// Walk backwards so that RemoveAt does not shift the blocks still to be
			// visited; this also avoids a second pass that searches for each block.
			for (int i = textBlocks.Count - 1; i >= 0; i--)
			{
				TextBlock tb = textBlocks[i];

				// Honour the configured label. The check used to test
				// DefaultLabels.TITLE regardless of what labelToKeep was set to.
				if (!tb.IsContent() && (labelToKeep == null || !tb.HasLabel(labelToKeep)))
				{
					textBlocks.RemoveAt(i);
					removed++;
				}
			}
			return removed > 0;
		}
		/// <summary>
		/// Computes statistics on a given <see cref="TextDocument">TextDocument</see>:
		/// the number of counted text blocks and the sum of their word counts.
		/// </summary>
		/// <param name="doc">The <see cref="TextDocument">TextDocument</see> to analyse.</param>
		/// <param name="contentOnly">
		/// If <c>true</c>, only blocks marked as content are counted; if <c>false</c>,
		/// every block is counted.
		/// </param>
		public TextDocumentStatistics(TextDocument doc, bool contentOnly)
		{
			foreach (TextBlock block in doc.GetTextBlocks())
			{
				if (!contentOnly || block.IsContent())
				{
					numWords += block.GetNumWords();
					numBlocks++;
				}
			}
		}
		/// <summary>
		/// Marks every text block of the document as content.
		/// </summary>
		/// <param name="doc">The document whose text blocks are updated.</param>
		/// <returns><c>true</c> if at least one block was not content before; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			bool anyFlipped = false;
			foreach (TextBlock block in doc.GetTextBlocks())
			{
				if (block.IsContent())
				{
					continue;
				}
				block.SetIsContent(true);
				anyFlipped = true;
			}
			return anyFlipped;
		}
		/// <summary>
		/// Demotes content blocks to non-content when they carry at least one of the
		/// labels in <c>labels</c>.
		/// </summary>
		/// <param name="doc">The document whose text blocks are examined.</param>
		/// <returns><c>true</c> if at least one block was demoted; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process (TextDocument doc)
		{
			bool demotedAny = false;
			foreach (TextBlock block in doc.GetTextBlocks())
			{
				if (!block.IsContent ())
				{
					continue;
				}
				foreach (string candidate in labels)
				{
					if (!block.HasLabel (candidate))
					{
						continue;
					}
					block.SetIsContent (false);
					demotedAny = true;
					// One matching label is enough; move on to the next block.
					break;
				}
			}
			return demotedAny;
		}
		/// <summary>
		/// Demotes content blocks for which <c>GetNumFullTextWords</c> reports fewer
		/// than <c>minWords</c> words.
		/// </summary>
		/// <param name="doc">The document whose text blocks are examined.</param>
		/// <returns><c>true</c> if at least one block was demoted; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			bool demotedAny = false;
			foreach (TextBlock block in doc.GetTextBlocks())
			{
				// Non-content blocks are skipped without counting their words.
				if (block.IsContent() && GetNumFullTextWords(block) < minWords)
				{
					block.SetIsContent(false);
					demotedAny = true;
				}
			}
			return demotedAny;
		}
		/// <summary>
		/// Runs the full chain of filters and classifiers over the document, in a
		/// fixed order. Every stage is executed regardless of the results of the
		/// stages before it.
		/// </summary>
		/// <param name="doc">The document to process.</param>
		/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public override bool Process (TextDocument doc)
		{
			// "|=" on bool does not short-circuit, so each stage below always runs.
			bool changed = TerminatingBlocksFinder.INSTANCE.Process (doc);
			changed |= new DocumentTitleMatchClassifier (doc.GetTitle ()).Process (doc);
			changed |= NumWordsRulesClassifier.INSTANCE.Process (doc);
			changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.Process (doc);
			changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.Process (doc);
			changed |= BlockProximityFusion.MAX_DISTANCE_1.Process (doc);
			changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.Process (doc);
			changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.Process (doc);
			changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.Process (doc);
			changed |= ExpandTitleToContentFilter.INSTANCE.Process (doc);
			changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.Process (doc);
			changed |= ListAtEndFilter.INSTANCE.Process (doc);
			return changed;
		}
		/// <summary>
		/// Marks short blocks (at most 10 words) as content and labels them
		/// <c>DefaultLabels.ARTICLE_METADATA</c> when their text matches one of the
		/// patterns in <c>PATTERNS_SHORT</c>.
		/// </summary>
		/// <param name="doc">The document whose text blocks are examined.</param>
		/// <returns><c>true</c> if at least one pattern matched a block; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public virtual bool Process(TextDocument doc)
		{
			bool matchedAny = false;
			foreach (TextBlock block in doc.GetTextBlocks())
			{
				if (block.GetNumWords() <= 10)
				{
					string blockText = block.GetText();
					// Every pattern is tried, even after an earlier one has matched.
					foreach (Sharpen.Pattern pattern in PATTERNS_SHORT)
					{
						if (!pattern.Matcher(blockText).Find())
						{
							continue;
						}
						matchedAny = true;
						block.SetIsContent(true);
						block.AddLabel(DefaultLabels.ARTICLE_METADATA);
					}
				}
			}
			return matchedAny;
		}
		/// <summary>
		/// Labels short blocks (fewer than 15 words) with
		/// <c>DefaultLabels.INDICATES_END_OF_TEXT</c> when their trimmed text matches
		/// one of a fixed set of phrases, or when the block has a link density of 1.0
		/// and its text is exactly "Comment".
		/// </summary>
		/// <param name="doc">The document whose text blocks are examined.</param>
		/// <returns><c>true</c> if at least one block was labelled; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public virtual bool Process(TextDocument doc)
		{
			bool changes = false;
			foreach (TextBlock tb in doc.GetTextBlocks())
			{
				if (tb.GetNumWords() >= 15)
				{
					continue;
				}

				string text = tb.GetText().Trim();
				int len = text.Length;
				if (len >= 8)
				{
					// The phrases are fixed lower-case literals, so lower-casing and
					// prefix tests must not depend on the current culture (with a
					// Turkish culture "I" lower-cases to a dotless "ı", for example).
					string textLC = text.ToLowerInvariant();
					bool indicatesEnd =
						textLC.StartsWith("references", System.StringComparison.Ordinal)
						|| StartsWithNumber(textLC, len, " comments", " users responded in")
						|| textLC.StartsWith("© reuters", System.StringComparison.Ordinal)
						|| textLC.StartsWith("please rate this", System.StringComparison.Ordinal)
						|| textLC.StartsWith("post a comment", System.StringComparison.Ordinal)
						|| textLC.Contains("what you think...")
						|| textLC.Contains("add your comment")
						|| textLC.Contains("add comment")
						|| textLC.Contains("reader views")
						|| textLC.Contains("have your say")
						|| textLC.Contains("reader comments")
						|| textLC.Contains("rätta artikeln")
						|| textLC.Equals("thanks for your comments - this feedback is now closed");
					if (indicatesEnd)
					{
						tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
						changes = true;
					}
				}
				else if (tb.GetLinkDensity() == 1.0 && text == "Comment")
				{
					// A text shorter than 8 characters that is a pure link reading "Comment".
					tb.AddLabel(DefaultLabels.INDICATES_END_OF_TEXT);
					changes = true;
				}
			}
			return changes;
		}
		/// <summary>
		/// Copies the labels of a block, prefixed with <c>labelPrefix</c>, onto the
		/// block that directly follows it. The list is processed from the end towards
		/// the start, and the block at index 0 is never used as a label source.
		/// </summary>
		/// <param name="doc">The document whose text blocks are labelled.</param>
		/// <returns>
		/// <c>true</c> if at least one source block had labels to copy; <c>false</c>
		/// otherwise, including when the document has fewer than two text blocks.
		/// </returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			IList<TextBlock> textBlocks = doc.GetTextBlocks();
			int count = textBlocks.Count;
			if (count < 2)
			{
				return false;
			}

			bool copiedAny = false;

			// Going backwards means a block's own labels are read before any
			// prefixed labels from its predecessor are added to it.
			for (int index = count - 2; index >= 1; index--)
			{
				TextBlock source = textBlocks[index];
				TextBlock target = textBlocks[index + 1];

				ICollection<string> sourceLabels = source.GetLabels();
				if (sourceLabels == null || sourceLabels.Count == 0)
				{
					continue;
				}

				foreach (string label in sourceLabels)
				{
					target.AddLabel(labelPrefix + label);
				}
				copiedAny = true;
			}
			return copiedAny;
		}
		// Example #16
		/// <summary>
		/// Merges each text block into the block before it when <c>EqualLabels</c>
		/// reports that both carry the same labels, and removes the merged block from
		/// the document.
		/// </summary>
		/// <param name="doc">The document whose text block list is fused in place.</param>
		/// <returns>
		/// <c>true</c> if at least one block was merged; <c>false</c> otherwise,
		/// including when the document has fewer than two text blocks.
		/// </returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			IList<TextBlock> textBlocks = doc.GetTextBlocks();
			if (textBlocks.Count < 2)
			{
				return false;
			}

			// Iterate over a copy so that blocks can be removed from the live list.
			List<TextBlock> followers = textBlocks.Skip(1).ToList();
			TextBlock anchor = textBlocks[0];
			bool mergedAny = false;

			foreach (TextBlock follower in followers)
			{
				if (!EqualLabels(anchor.GetLabels(), follower.GetLabels()))
				{
					anchor = follower;
					continue;
				}
				anchor.MergeNext(follower);
				textBlocks.Remove(follower);
				mergedAny = true;
			}
			return mergedAny;
		}
		/// <summary>
		/// Keeps only the content block with the most words as content. All other
		/// blocks are demoted and labelled <c>DefaultLabels.MIGHT_BE_CONTENT</c>. When
		/// <c>expandToSameLevelText</c> is set, blocks around the largest one that share
		/// its tag level and have at least <c>minWords</c> words are promoted again;
		/// the scan in either direction stops at the first block with a lower tag level.
		/// </summary>
		/// <param name="doc">The document whose text blocks are updated.</param>
		/// <returns>
		/// <c>false</c> if the document has fewer than two text blocks; otherwise
		/// always <c>true</c>.
		/// </returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			IList<TextBlock> textBlocks = doc.GetTextBlocks();
			if (textBlocks.Count < 2)
			{
				return false;
			}

			// Find the content block with the highest word count (first one wins on ties).
			TextBlock largestBlock = null;
			int largestIndex = -1;
			int largestWords = -1;
			int level = -1;
			for (int idx = 0; idx < textBlocks.Count; idx++)
			{
				TextBlock candidate = textBlocks[idx];
				if (!candidate.IsContent())
				{
					continue;
				}
				int words = candidate.GetNumWords();
				if (words <= largestWords)
				{
					continue;
				}
				largestBlock = candidate;
				largestWords = words;
				largestIndex = idx;
				if (expandToSameLevelText)
				{
					level = candidate.GetTagLevel();
				}
			}

			// Demote everything except the largest block.
			foreach (TextBlock tb in textBlocks)
			{
				if (tb == largestBlock)
				{
					tb.SetIsContent(true);
					continue;
				}
				tb.SetIsContent(false);
				tb.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
			}

			if (expandToSameLevelText && largestIndex != -1)
			{
				// First scan backwards from the block before the largest one, then
				// forwards starting at the largest block itself.
				IEnumerable<TextBlock>[] scans = new IEnumerable<TextBlock>[]
				{
					textBlocks.Take(largestIndex).Reverse(),
					textBlocks.Skip(largestIndex)
				};
				foreach (IEnumerable<TextBlock> scan in scans)
				{
					foreach (TextBlock tb in scan)
					{
						int tagLevel = tb.GetTagLevel();
						if (tagLevel < level)
						{
							break;
						}
						if (tagLevel == level && tb.GetNumWords() >= minWords)
						{
							tb.SetIsContent(true);
						}
					}
				}
			}
			return true;
		}
 /// <summary>
 /// Runs <c>Process</c> on the given
 /// <see cref="NBoilerpipePortable.Document.TextDocument">TextDocument</see>
 /// and returns what its <c>GetContent</c> method reports afterwards.
 /// </summary>
 /// <param name="doc">
 /// The <see cref="NBoilerpipePortable.Document.TextDocument">TextDocument</see>
 /// to process; it is modified by the call.
 /// </param>
 /// <returns>The extracted text.</returns>
 /// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
 public virtual string GetText(TextDocument doc)
 {
     // The boolean result of Process is not needed here.
     Process(doc);
     string extracted = doc.GetContent();
     return extracted;
 }
 /// <summary>
 /// Labels text blocks with <c>DefaultLabels.TITLE</c> when their normalised
 /// text equals one of the entries in <c>potentialTitles</c>. Normalisation
 /// replaces non-breaking spaces with spaces, removes apostrophes, trims and
 /// lower-cases the text.
 /// </summary>
 /// <param name="doc">The document whose text blocks are examined.</param>
 /// <returns>
 /// <c>true</c> if at least one block was labelled; <c>false</c> otherwise,
 /// including when <c>potentialTitles</c> is <c>null</c>.
 /// </returns>
 /// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
 public bool Process(TextDocument doc)
 {
     if (potentialTitles == null)
     {
         return false;
     }

     bool labelledAny = false;
     foreach (TextBlock block in doc.GetTextBlocks())
     {
         string normalized = block.GetText()
             .Replace('\u00a0', ' ')
             .Replace("'", "")
             .Trim()
             .ToLower();

         foreach (string title in potentialTitles)
         {
             if (!title.Equals(normalized))
             {
                 continue;
             }
             block.AddLabel(DefaultLabels.TITLE);
             labelledAny = true;
         }
     }
     return labelledAny;
 }
 /// <summary>
 /// Processes the given document. Concrete subclasses supply the implementation.
 /// </summary>
 /// <param name="arg1">The document to process.</param>
 /// <returns>
 /// NOTE(review): presumably <c>true</c> when the document was changed - confirm
 /// against the implementing classes.
 /// </returns>
 public abstract bool Process(TextDocument arg1);
		/// <summary>
		/// Delegates to <c>MarkEverythingContentFilter.INSTANCE</c>.
		/// </summary>
		/// <param name="doc">The document to process.</param>
		/// <returns>The result reported by the filter.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public override bool Process(TextDocument doc)
		{
			bool changed = MarkEverythingContentFilter.INSTANCE.Process(doc);
			return changed;
		}
		/// <summary>
		/// Runs <c>NumWordsRulesClassifier</c>, <c>BlockProximityFusion.MAX_DISTANCE_1</c>
		/// and <c>KeepLargestBlockFilter</c> over the document, in that order. Every
		/// stage is executed regardless of the results of the stages before it.
		/// </summary>
		/// <param name="doc">The document to process.</param>
		/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public override bool Process(TextDocument doc)
		{
			// "|=" on bool does not short-circuit, so each stage below always runs.
			bool changed = NumWordsRulesClassifier.INSTANCE.Process(doc);
			changed |= BlockProximityFusion.MAX_DISTANCE_1.Process(doc);
			changed |= KeepLargestBlockFilter.INSTANCE.Process(doc);
			return changed;
		}
		// Example #23
		/// <summary>
		/// Delegates to <c>CLASSIFIER</c>.
		/// </summary>
		/// <param name="doc">The document to process.</param>
		/// <returns>The result reported by <c>CLASSIFIER</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public override bool Process(TextDocument doc)
		{
			bool changed = CLASSIFIER.Process(doc);
			return changed;
		}
		/// <summary>
		/// Runs <c>SimpleBlockFusionProcessor</c>, <c>MarkEverythingContentFilter</c>
		/// and the configured <c>filter</c> over the document, in that order. Every
		/// stage is executed regardless of the results of the stages before it.
		/// </summary>
		/// <param name="doc">The document to process.</param>
		/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public override bool Process(TextDocument doc)
		{
			// "|=" on bool does not short-circuit, so each stage below always runs.
			bool changed = SimpleBlockFusionProcessor.INSTANCE.Process(doc);
			changed |= MarkEverythingContentFilter.INSTANCE.Process(doc);
			changed |= filter.Process(doc);
			return changed;
		}
		/// <summary>
		/// Runs <c>SimpleBlockFusionProcessor</c>, <c>BlockProximityFusion.MAX_DISTANCE_1</c>
		/// and <c>DensityRulesClassifier</c> over the document, in that order. Every
		/// stage is executed regardless of the results of the stages before it.
		/// </summary>
		/// <param name="doc">The document to process.</param>
		/// <returns><c>true</c> if at least one stage returned <c>true</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public override bool Process (TextDocument doc)
		{
			// "|=" on bool does not short-circuit, so each stage below always runs.
			bool changed = SimpleBlockFusionProcessor.INSTANCE.Process (doc);
			changed |= BlockProximityFusion.MAX_DISTANCE_1.Process (doc);
			changed |= DensityRulesClassifier.INSTANCE.Process (doc);
			return changed;
		}
		/// <summary>
		/// Merges content blocks into the block visited before them when the two are
		/// at most <c>maxBlocksDistance</c> blocks apart. With <c>contentOnly</c> the
		/// scan starts after the first content block and both blocks must be content;
		/// with <c>sameTagLevelOnly</c> both must also share the same tag level.
		/// Merged blocks are removed from the document after the scan.
		/// </summary>
		/// <param name="doc">The document whose text block list is fused in place.</param>
		/// <returns>
		/// <c>true</c> if at least one block was merged; <c>false</c> otherwise,
		/// including when the document has fewer than two text blocks or, with
		/// <c>contentOnly</c>, no content block at all.
		/// </returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			IList<TextBlock> textBlocks = doc.GetTextBlocks();
			if (textBlocks.Count < 2)
			{
				return false;
			}

			// Pick the block the scan starts from and the index of the first candidate.
			TextBlock anchor;
			int firstCandidate;
			if (!contentOnly)
			{
				anchor = textBlocks[0];
				firstCandidate = 1;
			}
			else
			{
				int firstContent = -1;
				for (int i = 0; i < textBlocks.Count; i++)
				{
					if (textBlocks[i].IsContent())
					{
						firstContent = i;
						break;
					}
				}
				if (firstContent < 0)
				{
					return false;
				}
				anchor = textBlocks[firstContent];
				firstCandidate = firstContent + 1;
			}

			// Merged blocks are collected first and removed once the scan is done,
			// because the list must not change while it is being enumerated.
			List<TextBlock> merged = new List<TextBlock>();
			foreach (TextBlock candidate in textBlocks.Skip(firstCandidate))
			{
				if (candidate.IsContent())
				{
					int gap = candidate.GetOffsetBlocksStart() - anchor.GetOffsetBlocksEnd() - 1;
					bool fuse = gap <= maxBlocksDistance
						&& (!contentOnly || (anchor.IsContent() && candidate.IsContent()))
						&& (!sameTagLevelOnly || anchor.GetTagLevel() == candidate.GetTagLevel());
					if (fuse)
					{
						anchor.MergeNext(candidate);
						merged.Add(candidate);
						// The anchor stays the same so that further blocks can join it.
						continue;
					}
				}
				anchor = candidate;
			}

			foreach (TextBlock block in merged)
			{
				textBlocks.Remove(block);
			}
			return merged.Count > 0;
		}