/// <summary>
/// Replaces every <see cref="TextBlock"/> whose text contains line breaks with one
/// block per paragraph, where paragraphs are separated by runs of '\n' / '\r' characters.
/// </summary>
/// <param name="doc">The document whose block list is rewritten in place.</param>
/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
public bool Process(TextDocument doc)
{
    List<TextBlock> original = doc.TextBlocks;
    var rebuilt = new List<TextBlock>();
    bool changes = false;

    foreach (TextBlock block in original)
    {
        string text = block.Text;
        string[] parts = Regex.Split(text, "[\n\r]+");

        if (parts.Length >= 2)
        {
            // Each paragraph inherits the content flag and the labels of the block it came from.
            bool isContent = block.IsContent;
            List<string> labels = (block.Labels ?? Enumerable.Empty<string>()).ToList();

            for (int i = 0; i < parts.Length; i++)
            {
                var piece = new TextBlock(parts[i]) { IsContent = isContent };
                piece.AddLabels(labels);
                rebuilt.Add(piece);
            }

            changes = true;
        }
        else
        {
            // Single-paragraph blocks are carried over untouched.
            rebuilt.Add(block);
        }
    }

    if (!changes)
    {
        return false;
    }

    // Swap the contents of the document's own list rather than replacing the list instance.
    original.Clear();
    original.AddRange(rebuilt);
    return true;
}
/// <summary>
/// Decides whether <paramref name="curr"/> is content, based on the word counts and
/// link densities of the block itself and of its two neighbours, and stores the result
/// in <c>curr.IsContent</c>.
/// </summary>
/// <param name="prev">The block preceding <paramref name="curr"/>.</param>
/// <param name="curr">The block being classified; its <c>IsContent</c> flag may be updated.</param>
/// <param name="next">The block following <paramref name="curr"/>.</param>
/// <returns><c>true</c> if the <c>IsContent</c> flag of <paramref name="curr"/> was changed; otherwise <c>false</c>.</returns>
private static bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    bool verdict;

    if (curr.LinkDensity > 0 && next.NumWords > 11)
    {
        verdict = true;
    }
    else if (curr.NumWords > 19)
    {
        verdict = true;
    }
    else if (next.NumWords > 6 && next.LinkDensity == 0 && prev.LinkDensity == 0)
    {
        // Link-free neighbourhood: any one of the three word-count thresholds is enough.
        verdict = curr.NumWords > 6 || prev.NumWords > 7 || next.NumWords > 19;
    }
    else
    {
        verdict = false;
    }

    if (curr.IsContent == verdict)
    {
        return false;
    }

    curr.IsContent = verdict;
    return true;
}
/// <summary>
/// Classifies <paramref name="curr"/> as content or non-content using fixed thresholds on
/// the link density and text density of the previous, current and next block, and stores
/// the result in <c>curr.IsContent</c>.
/// </summary>
/// <param name="prev">The block preceding <paramref name="curr"/>.</param>
/// <param name="curr">The block being classified; its <c>IsContent</c> flag may be updated.</param>
/// <param name="next">The block following <paramref name="curr"/>.</param>
/// <returns><c>true</c> if the <c>IsContent</c> flag of <paramref name="curr"/> was changed; otherwise <c>false</c>.</returns>
protected bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    // Comparisons are kept in their "<=" form (negated where needed) so that the outcome
    // for every input, including NaN densities, matches the nested-if formulation.
    bool verdict;

    if (!(curr.LinkDensity <= 0.333333))
    {
        verdict = false;
    }
    else if (!(prev.LinkDensity <= 0.555556))
    {
        verdict = !(next.TextDensity <= 11);
    }
    else if (!(curr.TextDensity <= 9))
    {
        verdict = !(next.TextDensity == 0);
    }
    else if (!(next.TextDensity <= 10))
    {
        verdict = true;
    }
    else
    {
        verdict = !(prev.TextDensity <= 4);
    }

    if (curr.IsContent == verdict)
    {
        return false;
    }

    curr.IsContent = verdict;
    return true;
}
/// <summary>
/// Classifies <paramref name="curr"/> as content or non-content using fixed thresholds on
/// the link density and word count of the previous, current and next block, and stores
/// the result in <c>curr.IsContent</c>.
/// </summary>
/// <param name="prev">The block preceding <paramref name="curr"/>.</param>
/// <param name="curr">The block being classified; its <c>IsContent</c> flag may be updated.</param>
/// <param name="next">The block following <paramref name="curr"/>.</param>
/// <returns><c>true</c> if the <c>IsContent</c> flag of <paramref name="curr"/> was changed; otherwise <c>false</c>.</returns>
protected bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    bool verdict;

    if (!(curr.LinkDensity <= 0.333333))
    {
        // A current block above the link-density threshold is never content.
        verdict = false;
    }
    else if (prev.LinkDensity <= 0.555556)
    {
        // Content unless the current, next and previous blocks are all at or below their limits.
        verdict = curr.NumWords > 16 || next.NumWords > 15 || prev.NumWords > 4;
    }
    else
    {
        verdict = curr.NumWords > 40 || next.NumWords > 17;
    }

    if (curr.IsContent == verdict)
    {
        return false;
    }

    curr.IsContent = verdict;
    return true;
}
/// <summary>
/// Returns the number of full-text words of <paramref name="tb"/>, using a minimum
/// text density of 9.
/// </summary>
/// <param name="tb">The block to inspect.</param>
/// <returns>The value produced by the two-argument overload for a threshold of 9.</returns>
protected static int GetNumFullTextWords(TextBlock tb)
{
    const float defaultMinTextDensity = 9;
    return GetNumFullTextWords(tb, defaultMinTextDensity);
}
/// <summary>
/// Returns the word count of <paramref name="tb"/> if its text density reaches
/// <paramref name="minTextDensity"/>; otherwise returns zero.
/// </summary>
/// <param name="tb">The block to inspect.</param>
/// <param name="minTextDensity">The inclusive lower bound on <c>tb.TextDensity</c>.</param>
/// <returns><c>tb.NumWords</c> when the density threshold is met; otherwise <c>0</c>.</returns>
protected static int GetNumFullTextWords(TextBlock tb, float minTextDensity)
{
    bool denseEnough = tb.TextDensity >= minTextDensity;
    return denseEnough ? tb.NumWords : 0;
}
/// <summary>
/// Merges specified <see cref="TextBlock"/> with this <see cref="TextBlock"/>.
/// </summary>
/// <remarks>
/// The other block's text is appended after a line feed, the word and line counters are
/// summed, the block offsets are widened to cover both blocks, the contained-text-element
/// bit sets are OR-ed together, labels are appended, and the smaller tag level is kept.
/// The merged block is content if either block was content.
/// </remarks>
/// <param name="other">The <see cref="TextBlock"/> to merge with.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is <c>null</c>.</exception>
public void MergeNext(TextBlock other)
{
    // Reject null up front so that a failed call does not leave this block half-modified.
    if (other == null)
    {
        throw new ArgumentNullException("other");
    }

    StringBuilder sb = _text;
    sb.Append('\n');
    sb.Append(other._text);

    _numWords += other._numWords;
    _numWordsInAnchorText += other._numWordsInAnchorText;
    _numWordsInWrappedLines += other._numWordsInWrappedLines;
    _numWrappedLines += other._numWrappedLines;

    _offsetBlocksStart = Math.Min(_offsetBlocksStart, other._offsetBlocksStart);
    _offsetBlocksEnd = Math.Max(_offsetBlocksEnd, other._offsetBlocksEnd);

    // Called only after the counters above have been summed.
    InitDensities();

    IsContent |= other.IsContent;

    BitArray otherElements = other._containedTextElements;
    if (otherElements != null)
    {
        if (_containedTextElements == null)
        {
            _containedTextElements = (BitArray)otherElements.Clone();
        }
        else
        {
            // BitArray.Or throws when the two arrays differ in length, so bring both to the
            // longer length first. Bits added by growing a BitArray are false, which leaves
            // the OR result unaffected. The other block's own array is never resized; a
            // copy is grown instead.
            if (_containedTextElements.Length < otherElements.Length)
            {
                _containedTextElements.Length = otherElements.Length;
            }
            else if (otherElements.Length < _containedTextElements.Length)
            {
                otherElements = (BitArray)otherElements.Clone();
                otherElements.Length = _containedTextElements.Length;
            }

            _containedTextElements.Or(otherElements);
        }
    }

    _numFullTextWords += other._numFullTextWords;

    if (other._labels != null)
    {
        if (_labels == null)
        {
            _labels = new List<String>(other._labels);
        }
        else
        {
            _labels.AddRange(other._labels);
        }
    }

    _tagLevel = Math.Min(_tagLevel, other._tagLevel);
}
/// <summary>
/// Applies the labels to <paramref name="block"/>, but only when the block satisfies
/// the configured condition.
/// </summary>
/// <param name="block">The <see cref="TextBlock" /> that may receive the labels.</param>
public override void AddTo(TextBlock block)
{
    bool accepted = _condition.MeetsCondition(block);
    if (!accepted)
    {
        return;
    }

    AddLabelsTo(block);
}
/// <summary>
/// Passes <c>Labels</c> to <c>AddLabels</c> on the given <see cref="TextBlock"/>.
/// </summary>
/// <param name="tb">The <see cref="TextBlock"/> that receives the labels.</param>
protected virtual void AddLabelsTo(TextBlock tb)
{
    var labelsToApply = Labels;
    tb.AddLabels(labelsToApply);
}
/// <summary>
/// Adds the labels to the given <see cref="TextBlock"/> by delegating to
/// <see cref="AddLabelsTo"/>.
/// </summary>
/// <param name="block">The <see cref="TextBlock"/> to add the labels to.</param>
public virtual void AddTo(TextBlock block)
{
    this.AddLabelsTo(block);
}