/// <summary>
		/// Splits every text block of <paramref name="doc"/> that spans several lines into
		/// one block per non-empty line. Each new block receives the content flag and the
		/// labels of the block it was split from.
		/// </summary>
		/// <param name="doc">Document whose text block list is rewritten in place.</param>
		/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
		/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
		public bool Process(TextDocument doc)
		{
			// Only line-break characters separate paragraphs. A run of consecutive breaks
			// (e.g. "\r\n" or a blank line) counts as a single separator, which is why
			// empty entries are dropped below.
			char[] lineBreaks = { '\n', '\r' };

			bool changes = false;
			IList<TextBlock> blocks = doc.GetTextBlocks();
			IList<TextBlock> blocksNew = new List<TextBlock>();
			foreach (TextBlock tb in blocks)
			{
				string text = tb.GetText();
				string[] paragraphs = text.Split(lineBreaks, System.StringSplitOptions.RemoveEmptyEntries);
				if (paragraphs.Length < 2)
				{
					// Nothing to split: keep the original block untouched.
					blocksNew.Add(tb);
					continue;
				}
				bool isContent = tb.IsContent();
				ICollection<string> labels = tb.GetLabels();
				foreach (string p in paragraphs)
				{
					TextBlock tbP = new TextBlock(p);
					tbP.SetIsContent(isContent);
					tbP.AddLabels(labels);
					blocksNew.Add(tbP);
				}
				changes = true;
			}
			if (changes)
			{
				// Replace the document's block list contents with the rebuilt sequence.
				blocks.Clear();
				foreach (TextBlock block in blocksNew)
				{
					blocks.Add(block);
				}
			}
			return changes;
		}
		/// <summary>
		/// Decides from the link densities and word counts of the previous, current and
		/// next block whether <paramref name="curr"/> is content, and stores that decision
		/// on <paramref name="curr"/>.
		/// </summary>
		/// <param name="prev">Block preceding <paramref name="curr"/>.</param>
		/// <param name="curr">Block being classified; its content flag is set.</param>
		/// <param name="next">Block following <paramref name="curr"/>.</param>
		/// <returns>The value returned by <c>curr.SetIsContent</c>.</returns>
		protected internal virtual bool Classify(TextBlock prev, TextBlock curr, TextBlock
			 next)
		{
			// A current block whose link density is not within the limit is never content.
			if (!(curr.GetLinkDensity() <= 0.333333))
			{
				return curr.SetIsContent(false);
			}

			bool keep;
			if (prev.GetLinkDensity() <= 0.555556)
			{
				// Rejected only when current, next and previous blocks are all short.
				keep = curr.GetNumWords() > 16
					|| next.GetNumWords() > 15
					|| prev.GetNumWords() > 4;
			}
			else
			{
				// Previous block is link-heavy: rejected when current and next are both short.
				keep = curr.GetNumWords() > 40
					|| next.GetNumWords() > 17;
			}
			return curr.SetIsContent(keep);
		}
		/// <summary>
		/// Returns the word count of <paramref name="tb"/> when its text density reaches
		/// <paramref name="minTextDensity"/>, and 0 otherwise.
		/// </summary>
		/// <param name="tb">Block to inspect.</param>
		/// <param name="minTextDensity">Inclusive lower bound for the block's text density.</param>
		/// <returns><c>tb.GetNumWords()</c> or 0.</returns>
		protected internal static int GetNumFullTextWords(TextBlock tb, float minTextDensity
			)
		{
			bool denseEnough = tb.GetTextDensity() >= minTextDensity;
			return denseEnough ? tb.GetNumWords() : 0;
		}
		/// <summary>
		/// Same as the two-argument overload, using a minimum text density of 9.
		/// </summary>
		/// <param name="tb">Block to inspect.</param>
		/// <returns>Result of <c>GetNumFullTextWords(tb, 9)</c>.</returns>
		protected internal static int GetNumFullTextWords(TextBlock tb)
		{
			const float defaultMinTextDensity = 9;
			return GetNumFullTextWords(tb, defaultMinTextDensity);
		}
        /// <summary>
        /// Labels <paramref name="tb"/> with the first font size yielded by
        /// <c>fontSizeStack</c> (if any), applies every label action found in
        /// <c>labelStacks</c>, and appends the block to <c>textBlocks</c>.
        /// </summary>
        /// <param name="tb">Block to label and store.</param>
        protected void AddTextBlock (TextBlock tb)
		{
			// Only the first entry produced by the enumeration is used.
			foreach (int fontSize in fontSizeStack) {
				tb.AddLabels ("font-" + fontSize);
				break;
			}

			foreach (List<LabelAction> actions in labelStacks) {
				if (actions == null) {
					// Null entries in labelStacks carry no actions.
					continue;
				}
				foreach (LabelAction action in actions) {
					action.AddTo (tb);
				}
			}

			textBlocks.Add (tb);
		}
		/// <summary>
		/// Converts the text buffered in <c>textBuilder</c> / <c>tokenBuilder</c> into a
		/// <c>TextBlock</c>, hands it to <c>AddTextBlock</c> and resets both buffers.
		/// </summary>
		/// <remarks>
		/// While <c>inBody</c> is 0 no block is created; if the last start tag was TITLE
		/// the buffered tokens are used as the title instead. Word statistics are computed
		/// by simulating line wrapping at 80 characters.
		/// </remarks>
		public void FlushBlock ()
		{
			if (inBody == 0) {
				// Tag names are identifiers, not linguistic text: compare ordinally so the
				// match does not depend on the current culture's casing rules.
				if (string.Equals ("TITLE", lastStartTag, StringComparison.OrdinalIgnoreCase)) {
					SetTitle (tokenBuilder.ToString ().Trim ());
				}
				textBuilder.Length = 0;
				tokenBuilder.Length = 0;
				return;
			}

			int length = tokenBuilder.Length;
			if (length == 0) {
				return;
			}
			if (length == 1 && sbLastWasWhitespace) {
				// The buffer holds a single character and the whitespace flag is set: discard it.
				textBuilder.Length = 0;
				tokenBuilder.Length = 0;
				return;
			}

			string[] tokens = UnicodeTokenizer.Tokenize (tokenBuilder);
			const int maxLineLength = 80;
			int numWords = 0;
			int numLinkedWords = 0;
			int numWrappedLines = 0;
			int currentLineLength = -1; // don't count the first space
			int numTokens = 0;
			int numWordsCurrentLine = 0;

			foreach (string token in tokens) {
				if (token == ANCHOR_TEXT_START) {
					inAnchorText = true;
				} else if (token == ANCHOR_TEXT_END) {
					inAnchorText = false;
				} else if (IsWord (token)) {
					numTokens++;
					numWords++;
					numWordsCurrentLine++;

					if (inAnchorText) {
						numLinkedWords++;
					}
					int tokenLength = token.Length;
					currentLineLength += tokenLength + 1;
					if (currentLineLength > maxLineLength) {
						// The word does not fit on the simulated line: start a new line with it.
						numWrappedLines++;
						currentLineLength = tokenLength;
						numWordsCurrentLine = 1;
					}
				} else {
					// Non-word tokens count as tokens but not as words.
					numTokens++;
				}
			}
			if (numTokens == 0) {
				// Only anchor markers were buffered; the buffers are left as they are.
				return;
			}

			int numWordsInWrappedLines;
			if (numWrappedLines == 0) {
				// Everything fit on one line: treat that line as the single wrapped line.
				numWordsInWrappedLines = numWords;
				numWrappedLines = 1;
			} else {
				// Words on the last, unfinished line are excluded.
				numWordsInWrappedLines = numWords - numWordsCurrentLine;
			}

			TextBlock tb = new TextBlock (textBuilder.ToString ().Trim (), currentContainedTextElements
				, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, offsetBlocks
				);
			currentContainedTextElements = new BitSet ();
			offsetBlocks++;
			textBuilder.Length = 0;
			tokenBuilder.Length = 0;
			tb.SetTagLevel (blockTagLevel);
			AddTextBlock (tb);
			blockTagLevel = -1;
		}
// Example #7
// 0
			/// <summary>
			/// Marks <paramref name="curr"/> as content or not, based on the link densities
			/// and word counts of the previous, current and next block.
			/// </summary>
			/// <param name="prev">Block preceding <paramref name="curr"/>.</param>
			/// <param name="curr">Block being classified; its content flag is set.</param>
			/// <param name="next">Block following <paramref name="curr"/>.</param>
			/// <returns>The value returned by <c>curr.SetIsContent</c>.</returns>
			protected internal bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
			{
				bool keep;
				if (curr.GetLinkDensity() > 0 && next.GetNumWords() > 11)
				{
					// Current block contains links and is followed by a longer block.
					keep = true;
				}
				else if (curr.GetNumWords() > 19)
				{
					keep = true;
				}
				else if (next.GetNumWords() > 6 && next.GetLinkDensity() == 0 && prev.GetLinkDensity() == 0)
				{
					// Link-free neighbours: one of the three blocks must be long enough.
					keep = curr.GetNumWords() > 6 || prev.GetNumWords() > 7 || next.GetNumWords() > 19;
				}
				else
				{
					keep = false;
				}
				return curr.SetIsContent(keep);
			}
			/// <summary>
			/// Tests whether <paramref name="tb"/> has a link density of exactly 0 and more
			/// than 6 words.
			/// </summary>
			/// <param name="tb">Block to test.</param>
			/// <returns><c>true</c> when both conditions hold.</returns>
			public bool MeetsCondition(TextBlock tb)
			{
				if (tb.GetLinkDensity() != 0)
				{
					return false;
				}
				return tb.GetNumWords() > 6;
			}
// Example #9
// 0
		/// <summary>
		/// Creates a new <c>TextBlock</c> from this block's text and gives it its own
		/// copies of the label set and the contained-text-elements bit set.
		/// </summary>
		/// <returns>The newly created block.</returns>
		public Object Clone ()
		{
			// NOTE(review): only text, labels and containedTextElements are carried over
			// explicitly; whether the TextBlock(string) constructor restores any other
			// state is not visible here - confirm.
			TextBlock copy = new TextBlock (text.ToString());

			bool hasLabels = labels != null && !labels.IsEmpty ();
			if (hasLabels) {
				copy.labels = new HashSet<string> (labels);
			}

			if (containedTextElements != null) {
				copy.containedTextElements = (BitSet)containedTextElements.Clone ();
			}

			return copy;
		}