/// <summary>
///   Splits every <see cref="TextBlock"/> whose text contains line breaks into one
///   new block per non-empty paragraph. Each new block copies the <c>IsContent</c>
///   flag and the labels of the block it was split from.
/// </summary>
/// <param name="doc">The document whose <c>TextBlocks</c> list is rewritten in place.</param>
/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
public bool Process(TextDocument doc) {
      bool changes = false;

      List<TextBlock> blocks = doc.TextBlocks;
      var blocksNew = new List<TextBlock>();

      foreach (TextBlock tb in blocks) {
        // Regex.Split keeps the empty strings that result from a leading or trailing
        // run of line breaks. Drop them so that a block such as "abc\n" is left alone
        // instead of being replaced by "abc" plus an empty block.
        List<string> paragraphs = Regex.Split(tb.Text, "[\n\r]+")
                                       .Where(p => p.Length > 0)
                                       .ToList();
        if (paragraphs.Count < 2) {
          // Nothing to split: keep the original block instance.
          blocksNew.Add(tb);
          continue;
        }

        bool isContent = tb.IsContent;
        // Snapshot the labels once; Labels may be null.
        List<string> labels = (tb.Labels ?? Enumerable.Empty<string>()).ToList();
        foreach (string p in paragraphs) {
          var tbP = new TextBlock(p) { IsContent = isContent };
          tbP.AddLabels(labels);
          blocksNew.Add(tbP);
        }
        changes = true;
      }

      if (changes) {
        // Replace the contents of the document's own list rather than the list itself.
        blocks.Clear();
        blocks.AddRange(blocksNew);
      }

      return changes;
    }
Ejemplo n.º 2
0
      /// <summary>
      ///   Decides from the word counts and link densities of a block and its two
      ///   neighbours whether <paramref name="curr"/> is content, and stores the
      ///   result in its <c>IsContent</c> flag.
      /// </summary>
      /// <param name="prev">The block preceding <paramref name="curr"/>.</param>
      /// <param name="curr">The block being classified.</param>
      /// <param name="next">The block following <paramref name="curr"/>.</param>
      /// <returns><c>true</c> if the <c>IsContent</c> flag of <paramref name="curr"/> was changed.</returns>
      private static bool Classify(TextBlock prev, TextBlock curr, TextBlock next) {
        bool verdict;

        if (curr.LinkDensity > 0 && next.NumWords > 11) {
          verdict = true;
        } else if (curr.NumWords > 19) {
          verdict = true;
        } else if (next.NumWords > 6 && next.LinkDensity == 0 && prev.LinkDensity == 0) {
          // Link-free neighbourhood: any one of the three word-count thresholds is enough.
          verdict = curr.NumWords > 6 || prev.NumWords > 7 || next.NumWords > 19;
        } else {
          verdict = false;
        }

        if (curr.IsContent == verdict) {
          return false;
        }
        curr.IsContent = verdict;
        return true;
      }
    /// <summary>
    ///   Classifies <paramref name="curr"/> with a fixed decision tree over the link
    ///   density and text density of the block and its two neighbours, and stores the
    ///   result in its <c>IsContent</c> flag.
    /// </summary>
    /// <param name="prev">The block preceding <paramref name="curr"/>.</param>
    /// <param name="curr">The block being classified.</param>
    /// <param name="next">The block following <paramref name="curr"/>.</param>
    /// <returns><c>true</c> if the <c>IsContent</c> flag of <paramref name="curr"/> was changed.</returns>
    protected bool Classify(TextBlock prev, TextBlock curr, TextBlock next) {
      bool verdict;

      // Each guard is the negation of the corresponding "<=" threshold test, so the
      // comparisons performed (and their order) are the same as in the nested form.
      if (!(curr.LinkDensity <= 0.333333)) {
        verdict = false;
      } else if (!(prev.LinkDensity <= 0.555556)) {
        verdict = !(next.TextDensity <= 11);
      } else if (!(curr.TextDensity <= 9)) {
        verdict = next.TextDensity != 0;
      } else if (!(next.TextDensity <= 10)) {
        verdict = true;
      } else {
        verdict = !(prev.TextDensity <= 4);
      }

      if (curr.IsContent == verdict) {
        return false;
      }
      curr.IsContent = verdict;
      return true;
    }
    /// <summary>
    ///   Classifies <paramref name="curr"/> with a fixed decision tree over the link
    ///   density and word count of the block and its two neighbours, and stores the
    ///   result in its <c>IsContent</c> flag.
    /// </summary>
    /// <param name="prev">The block preceding <paramref name="curr"/>.</param>
    /// <param name="curr">The block being classified.</param>
    /// <param name="next">The block following <paramref name="curr"/>.</param>
    /// <returns><c>true</c> if the <c>IsContent</c> flag of <paramref name="curr"/> was changed.</returns>
    protected bool Classify(TextBlock prev, TextBlock curr, TextBlock next) {
      bool verdict;

      if (curr.LinkDensity <= 0.333333) {
        if (prev.LinkDensity <= 0.555556) {
          // Rejected only when the block and both neighbours are all short.
          verdict = !(curr.NumWords <= 16 && next.NumWords <= 15 && prev.NumWords <= 4);
        } else {
          // Link-heavy predecessor: rejected when this block and the next are both short.
          verdict = !(curr.NumWords <= 40 && next.NumWords <= 17);
        }
      } else {
        verdict = false;
      }

      if (curr.IsContent == verdict) {
        return false;
      }
      curr.IsContent = verdict;
      return true;
    }
Ejemplo n.º 5
0
 /// <summary>
 ///   Returns the number of full-text words of <paramref name="tb"/> using a
 ///   minimum text density of 9.
 /// </summary>
 /// <param name="tb">The block to inspect.</param>
 /// <returns>The result of the two-argument overload for a minimum text density of 9.</returns>
 protected static int GetNumFullTextWords(TextBlock tb) {
   const float defaultMinTextDensity = 9;
   return GetNumFullTextWords(tb, defaultMinTextDensity);
 }
Ejemplo n.º 6
0
 /// <summary>
 ///   Returns the word count of <paramref name="tb"/> if its text density reaches
 ///   <paramref name="minTextDensity"/>, and 0 otherwise.
 /// </summary>
 /// <param name="tb">The block to inspect.</param>
 /// <param name="minTextDensity">The inclusive lower bound on the block's text density.</param>
 /// <returns><c>tb.NumWords</c> when the density threshold is met; otherwise 0.</returns>
 protected static int GetNumFullTextWords(TextBlock tb, float minTextDensity) {
   return tb.TextDensity >= minTextDensity ? tb.NumWords : 0;
 }
Ejemplo n.º 7
0
    /// <summary>
    /// Merges specified <see cref="TextBlock"/> with this <see cref="TextBlock"/>.
    /// </summary>
    /// <remarks>
    /// The text of <paramref name="other"/> is appended after a line break, the word and
    /// line counters are summed, the block-offset range is widened to cover both blocks,
    /// the contained-text-element bits and the labels are united, and the smaller tag
    /// level is kept. The merged block is content if either block was.
    /// </remarks>
    /// <param name="other">The <see cref="TextBlock"/> to merge with.</param>
    /// <exception cref="ArgumentNullException"><paramref name="other"/> is <c>null</c>.</exception>
    public void MergeNext(TextBlock other) {
      // Validate before touching any state so a bad argument cannot leave this
      // block half-merged (the text used to be modified before the failure).
      if (other == null) {
        throw new ArgumentNullException("other");
      }

      StringBuilder sb = _text;
      sb.Append('\n');
      sb.Append(other._text);

      _numWords += other._numWords;
      _numWordsInAnchorText += other._numWordsInAnchorText;

      _numWordsInWrappedLines += other._numWordsInWrappedLines;
      _numWrappedLines += other._numWrappedLines;

      _offsetBlocksStart = Math.Min(_offsetBlocksStart, other._offsetBlocksStart);
      _offsetBlocksEnd = Math.Max(_offsetBlocksEnd, other._offsetBlocksEnd);

      // Called after the counters above have been updated.
      InitDensities();

      IsContent |= other.IsContent;

      BitArray otherElements = other._containedTextElements;
      if (otherElements != null) {
        if (_containedTextElements == null) {
          _containedTextElements = (BitArray)otherElements.Clone();
        } else {
          if (otherElements.Length != _containedTextElements.Length) {
            // BitArray.Or throws when the lengths differ. Pad both operands with
            // unset bits up to the longer length; work on a copy so that "other"
            // itself is not modified.
            int unionLength = Math.Max(_containedTextElements.Length, otherElements.Length);
            otherElements = (BitArray)otherElements.Clone();
            otherElements.Length = unionLength;
            _containedTextElements.Length = unionLength;
          }
          _containedTextElements.Or(otherElements);
        }
      }

      _numFullTextWords += other._numFullTextWords;

      if (other._labels != null) {
        if (_labels == null) {
          _labels = new List<String>(other._labels);
        } else {
          _labels.AddRange(other._labels);
        }
      }

      _tagLevel = Math.Min(_tagLevel, other._tagLevel);
    }
 /// <summary>
 ///   Labels the given <see cref="TextBlock" />, but only when it satisfies this
 ///   instance's condition.
 /// </summary>
 /// <param name="block">The <see cref="TextBlock" /> that is tested and, on success, labelled.</param>
 public override void AddTo(TextBlock block) {
   if (!_condition.MeetsCondition(block)) {
     return;
   }

   AddLabelsTo(block);
 }
Ejemplo n.º 9
0
 /// <summary>
 ///   Passes this instance's <c>Labels</c> to <c>AddLabels</c> on the given block.
 /// </summary>
 /// <param name="tb">The <see cref="TextBlock" /> that receives the labels.</param>
 protected virtual void AddLabelsTo(TextBlock tb) {
   var labelsToAdd = Labels;
   tb.AddLabels(labelsToAdd);
 }
Ejemplo n.º 10
0
 /// <summary>
 ///   Labels the given <see cref="TextBlock" /> by delegating to <c>AddLabelsTo</c>.
 /// </summary>
 /// <param name="block">The <see cref="TextBlock" /> to label.</param>
 public virtual void AddTo(TextBlock block) {
   this.AddLabelsTo(block);
 }