Describes a block of text.
Describes a block of text. A block can be an "atomic" text element (i.e., a sequence of text that is not interrupted by any HTML markup) or a compound of such atomic elements.
Inheritance: ICloneable
 /// <summary>
 /// Splits every text block whose text contains line breaks into one block per
 /// non-empty line segment. Each new block receives the content flag and the
 /// labels of the block it was split from.
 /// </summary>
 /// <remarks>
 /// The replacement blocks are created with the single-argument
 /// <c>TextBlock</c> constructor; only the content flag and labels are copied
 /// onto them here.
 /// </remarks>
 /// <param name="doc">
 /// Document to process. When a split happens, the list returned by
 /// <c>GetTextBlocks()</c> is cleared and refilled with the new sequence.
 /// </param>
 /// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
 /// <exception cref="NBoilerpipe.BoilerpipeProcessingException"></exception>
 public bool Process(TextDocument doc)
 {
     bool changes = false;
     IList<TextBlock> blocks = doc.GetTextBlocks();
     IList<TextBlock> blocksNew = new AList<TextBlock>();
     // Split on the characters themselves rather than on the pattern string
     // "[\n\r]+": a string argument is only treated as a regular expression when
     // an extension method happens to bind, and is taken as a literal separator
     // by the framework's own string.Split(string, ...) overload where available.
     char[] lineBreaks = new char[] { '\n', '\r' };
     foreach (TextBlock tb in blocks)
     {
         string text = tb.GetText();
         // RemoveEmptyEntries collapses runs of line breaks (the "+" of the
         // original pattern) and drops empty leading/trailing segments, so no
         // empty paragraph is turned into a block.
         string[] paragraphs = text.Split(lineBreaks, System.StringSplitOptions.RemoveEmptyEntries);
         if (paragraphs.Length < 2)
         {
             // Nothing to split: keep the original block untouched.
             blocksNew.AddItem(tb);
             continue;
         }
         changes = true;
         bool isContent = tb.IsContent();
         ICollection<string> labels = tb.GetLabels();
         foreach (string p in paragraphs)
         {
             TextBlock tbP = new TextBlock(p);
             tbP.SetIsContent(isContent);
             tbP.AddLabels(labels);
             blocksNew.AddItem(tbP);
         }
     }
     if (changes)
     {
         // Replace the document's block list contents in place.
         blocks.Clear();
         Sharpen.Collections.AddAll(blocks, blocksNew);
     }
     return changes;
 }
 /// <summary>
 /// Adds this action's labels to <paramref name="tb"/>, but only when
 /// <c>condition.MeetsCondition</c> accepts the block.
 /// </summary>
 /// <param name="tb">Block to test and, if accepted, label.</param>
 public override void AddTo(TextBlock tb)
 {
     if (!condition.MeetsCondition(tb))
     {
         return;
     }
     AddLabelsTo(tb);
 }
        /// <summary>
        /// Decides whether <paramref name="curr"/> is content from the link and text
        /// densities of the block and its two neighbours, then stores the decision
        /// on <paramref name="curr"/>.
        /// </summary>
        /// <param name="prev">Block preceding <paramref name="curr"/>.</param>
        /// <param name="curr">Block being classified; its content flag is set.</param>
        /// <param name="next">Block following <paramref name="curr"/>.</param>
        /// <returns>The value returned by <c>curr.SetIsContent</c>.</returns>
        protected internal virtual bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
        {
            // Default covers the branch where the current block's link density
            // exceeds the 0.333333 threshold.
            bool isContent = false;

            if (curr.GetLinkDensity() <= 0.333333)
            {
                if (prev.GetLinkDensity() <= 0.555556)
                {
                    if (curr.GetTextDensity() <= 9)
                    {
                        // Low text density: rejected only when the next block is at
                        // most 10 and the previous block at most 4.
                        isContent = !(next.GetTextDensity() <= 10 && prev.GetTextDensity() <= 4);
                    }
                    else
                    {
                        // Higher text density: rejected only when the next block's
                        // text density is exactly 0.
                        isContent = next.GetTextDensity() != 0;
                    }
                }
                else
                {
                    // Previous block is link-heavy: the next block must exceed 11.
                    // Written as a negated "<=" to keep the original comparison.
                    isContent = !(next.GetTextDensity() <= 11);
                }
            }

            return curr.SetIsContent(isContent);
        }
Example #4
0
        /// <summary>
        /// Returns the word count of <paramref name="tb"/> when its text density
        /// reaches <paramref name="minTextDensity"/>, and 0 otherwise.
        /// </summary>
        /// <param name="tb">Block to inspect.</param>
        /// <param name="minTextDensity">Inclusive lower bound on the block's text density.</param>
        /// <returns><c>tb.GetNumWords()</c> or 0.</returns>
        protected internal static int GetNumFullTextWords(TextBlock tb, float minTextDensity)
        {
            bool denseEnough = tb.GetTextDensity() >= minTextDensity;
            return denseEnough ? tb.GetNumWords() : 0;
        }
 /// <summary>
 /// Accepts a block whose link density is exactly 0 and which has more than
 /// six words.
 /// </summary>
 /// <param name="tb">Block to test.</param>
 /// <returns><c>true</c> when both criteria hold.</returns>
 public bool MeetsCondition(TextBlock tb)
 {
     // Any link density other than exactly 0 disqualifies the block; the word
     // count is only consulted afterwards.
     if (tb.GetLinkDensity() != 0)
     {
         return false;
     }
     return tb.GetNumWords() > 6;
 }
Example #6
0
 /// <summary>
 /// Convenience overload of <c>GetNumFullTextWords(TextBlock, float)</c> that
 /// uses a minimum text density of 9.
 /// </summary>
 /// <param name="tb">Block to inspect.</param>
 /// <returns>The result of the two-argument overload.</returns>
 protected internal static int GetNumFullTextWords(TextBlock tb)
 {
     const float defaultMinTextDensity = 9;
     return GetNumFullTextWords(tb, defaultMinTextDensity);
 }
        /// <summary>
        /// Labels <paramref name="tb"/> and appends it to <c>textBlocks</c>.
        /// A "font-N" label is added for the first entry enumerated from
        /// <c>fontSizeStack</c> (if any), then every label action found in
        /// <c>labelStacks</c> is applied.
        /// </summary>
        /// <param name="tb">Block to label and store.</param>
        protected void AddTextBlock(TextBlock tb)
        {
            // Only the first enumerated font size is used; the loop exits
            // immediately after handling it.
            foreach (int fontSize in fontSizeStack) {
                tb.AddLabels ("font-" + fontSize);
                break;
            }

            foreach (List<LabelAction> actions in labelStacks) {
                // Stack levels without label actions are stored as null.
                if (actions == null) {
                    continue;
                }
                foreach (LabelAction action in actions) {
                    action.AddTo (tb);
                }
            }

            textBlocks.Add (tb);
        }
        /// <summary>
        /// Converts the text buffered in <c>textBuilder</c> / <c>tokenBuilder</c> into a
        /// <c>TextBlock</c> with word, linked-word and wrapped-line statistics, passes
        /// it to <c>AddTextBlock</c>, and resets the buffers.
        /// </summary>
        /// <remarks>
        /// No block is produced while <c>inBody</c> is 0, when the token buffer is
        /// empty, when it has length 1 and <c>sbLastWasWhitespace</c> is set, or
        /// when tokenizing yields no countable tokens.
        /// </remarks>
        public void FlushBlock()
        {
            // inBody == 0 (presumably: not inside the document body — confirm):
            // nothing is emitted. If the last start tag was TITLE, the buffered
            // tokens are stored as the title. Both buffers are cleared either way.
            if (inBody == 0) {
                // NOTE(review): the inner inBody == 0 test repeats the enclosing condition.
                if (inBody == 0 && Sharpen.Runtime.EqualsIgnoreCase ("TITLE", lastStartTag))
                    SetTitle (tokenBuilder.ToString ().Trim ());
                textBuilder.Length = 0;
                tokenBuilder.Length = 0;
                return;
            }

            int length = tokenBuilder.Length;
            if (length == 0) {
                // Nothing buffered.
                return;
            } else if (length == 1) {
                // A buffer of length 1 is discarded when sbLastWasWhitespace is set.
                if (sbLastWasWhitespace) {
                    textBuilder.Length = 0;
                    tokenBuilder.Length = 0;
                    return;
                }
            }

            string[] tokens = UnicodeTokenizer.Tokenize (tokenBuilder);
            int numWords = 0;
            int numLinkedWords = 0;
            int numWrappedLines = 0;
            int currentLineLength = -1; // don't count the first space
            int maxLineLength = 80;
            int numTokens = 0;
            int numWordsCurrentLine = 0;

            // Walk the tokens: the anchor markers toggle inAnchorText and are not
            // counted; every other token counts towards numTokens, and tokens
            // accepted by IsWord also feed the word and line-wrapping counters.
            foreach (string token in tokens) {
                if (token == ANCHOR_TEXT_START) {
                    inAnchorText = true;
                } else {
                    if (token == ANCHOR_TEXT_END) {
                        inAnchorText = false;
                    } else {
                        if (IsWord (token)) {
                            numTokens++;
                            numWords++;
                            numWordsCurrentLine++;

                            if (inAnchorText) {
                                numLinkedWords++;
                            }
                            // Simulate wrapping at maxLineLength characters: each word
                            // costs its length plus one separator. On overflow a new
                            // line starts with this word as its only content so far.
                            int tokenLength = token.Length;
                            currentLineLength += tokenLength + 1;
                            if (currentLineLength > maxLineLength) {
                                numWrappedLines++;
                                currentLineLength = tokenLength;
                                numWordsCurrentLine = 1;
                            }
                        } else {
                            numTokens++;
                        }
                    }
                }
            }
            // NOTE(review): this early return leaves textBuilder and tokenBuilder
            // uncleared, unlike the other early exits above — confirm this is intended.
            if (numTokens == 0) {
                return;
            }
            int numWordsInWrappedLines;
            if (numWrappedLines == 0) {
                // No wrap occurred: all words count and the line count becomes 1.
                numWordsInWrappedLines = numWords;
                numWrappedLines = 1;
            } else {
                // Words on the last, unfinished line are excluded.
                numWordsInWrappedLines = numWords - numWordsCurrentLine;
            }
            TextBlock tb = new TextBlock (textBuilder.ToString ().Trim (), currentContainedTextElements
                , numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, offsetBlocks
                );
            // The new block keeps the previous BitSet instance; start a fresh one
            // for whatever is collected next.
            currentContainedTextElements = new BitSet ();
            offsetBlocks++;
            textBuilder.Length = 0;
            tokenBuilder.Length = 0;
            tb.SetTagLevel (blockTagLevel);
            AddTextBlock (tb);
            // Reset the tag level to -1 after the block has been emitted.
            blockTagLevel = -1;
        }
Example #9
0
 /// <summary>
 /// Passes this instance's <c>labels</c> to <c>tb.AddLabels</c>.
 /// </summary>
 /// <param name="tb">Block that receives the labels.</param>
 protected internal void AddLabelsTo(TextBlock tb)
 {
     tb.AddLabels(labels);
 }
Example #10
0
 /// <summary>
 /// Applies this action to <paramref name="tb"/> by delegating to
 /// <c>AddLabelsTo</c> without any further check. Declared virtual so that
 /// derived types can change when the labels are applied.
 /// </summary>
 /// <param name="tb">Block that receives the labels.</param>
 public virtual void AddTo(TextBlock tb)
 {
     AddLabelsTo(tb);
 }
Example #11
0
 /// <summary>
 /// Decides whether <paramref name="curr"/> is content from the word counts and
 /// link densities of the block and its neighbours, then stores the decision
 /// on <paramref name="curr"/>.
 /// </summary>
 /// <param name="prev">Block preceding <paramref name="curr"/>.</param>
 /// <param name="curr">Block being classified; its content flag is set.</param>
 /// <param name="next">Block following <paramref name="curr"/>.</param>
 /// <returns>The value returned by <c>curr.SetIsContent</c>.</returns>
 protected internal bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
 {
     bool isContent;
     if (curr.GetLinkDensity() > 0 && next.GetNumWords() > 11)
     {
         // Current block contains links and is followed by a block of more than 11 words.
         isContent = true;
     }
     else if (curr.GetNumWords() > 19)
     {
         // More than 19 words qualifies on its own.
         isContent = true;
     }
     else if (next.GetNumWords() > 6 && next.GetLinkDensity() == 0 && prev.GetLinkDensity() == 0)
     {
         // Both neighbours are link-free and the next block has more than 6
         // words: any one of the three blocks passing its word threshold suffices.
         isContent = curr.GetNumWords() > 6 || prev.GetNumWords() > 7 || next.GetNumWords() > 19;
     }
     else
     {
         isContent = false;
     }
     return curr.SetIsContent(isContent);
 }
Example #12
0
        /// <summary>
        /// Creates a new <c>TextBlock</c> from this block's text, giving it its own
        /// <c>HashSet</c> of the labels (when there are any) and a clone of the
        /// contained-text-elements bit set (when present).
        /// </summary>
        /// <remarks>
        /// NOTE(review): only the text, labels and contained text elements are
        /// carried over here; all other state of the copy is whatever the
        /// single-argument constructor sets up — confirm that this is intended.
        /// </remarks>
        /// <returns>The newly created block.</returns>
        public Object Clone()
        {
            TextBlock copy = new TextBlock (text.ToString());

            bool hasLabels = labels != null && !labels.IsEmpty ();
            if (hasLabels) {
                copy.labels = new HashSet<string> (labels);
            }

            if (containedTextElements == null) {
                return copy;
            }
            copy.containedTextElements = (BitSet)containedTextElements.Clone ();
            return copy;
        }