/// <summary>
/// Splits every text block whose text spans more than one line into one block
/// per non-empty line. Each new block receives the content flag and the labels
/// of the block it was cut from. Blocks with fewer than two non-empty lines
/// are kept unchanged.
/// </summary>
/// <param name="doc">
/// Document to process. When a split happens, the list returned by
/// <c>GetTextBlocks()</c> is cleared and refilled with the new sequence.
/// </param>
/// <returns><c>true</c> if at least one block was split; otherwise <c>false</c>.</returns>
/// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
public bool Process(TextDocument doc)
{
    // Only line-break characters separate paragraphs. The previous separator
    // set ('[', '\n', '\r', ']', '+') spelled out the regex "[\n\r]+" one
    // character at a time, which also split on literal brackets and plus signs.
    char[] lineBreaks = { '\n', '\r' };

    bool changes = false;
    IList<TextBlock> blocks = doc.GetTextBlocks();
    IList<TextBlock> blocksNew = new List<TextBlock>();

    foreach (TextBlock tb in blocks)
    {
        string text = tb.GetText();

        // RemoveEmptyEntries collapses runs of line breaks (e.g. "\r\n" or blank
        // lines) so that no empty paragraph blocks are produced.
        string[] paragraphs = text.Split(lineBreaks, System.StringSplitOptions.RemoveEmptyEntries);
        if (paragraphs.Length < 2)
        {
            // Nothing to split: keep the original block instance.
            blocksNew.Add(tb);
            continue;
        }

        bool isContent = tb.IsContent();
        ICollection<string> labels = tb.GetLabels();
        foreach (string p in paragraphs)
        {
            TextBlock tbP = new TextBlock(p);
            tbP.SetIsContent(isContent);
            tbP.AddLabels(labels);
            blocksNew.Add(tbP);
        }
        changes = true;
    }

    if (changes)
    {
        // Replace the contents of the existing list instead of swapping the
        // list object, so the same IList instance stays in use.
        blocks.Clear();
        foreach (var block in blocksNew)
        {
            blocks.Add(block);
        }
    }

    return changes;
}
/// <summary>
/// Decides whether <paramref name="curr"/> is content, based on the link
/// density and word count of the block itself and of its two neighbours, and
/// stores the decision on <paramref name="curr"/>.
/// </summary>
/// <param name="prev">Block preceding <paramref name="curr"/>.</param>
/// <param name="curr">Block being classified; its content flag is updated.</param>
/// <param name="next">Block following <paramref name="curr"/>.</param>
/// <returns>The value returned by <c>curr.SetIsContent(...)</c>.</returns>
protected internal virtual bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    // A block whose link density is not within the 0.333333 limit is never content.
    // Written as a negated "<=" so the comparison is exactly the original one.
    if (!(curr.GetLinkDensity() <= 0.333333))
    {
        return curr.SetIsContent(false);
    }

    bool rejected;
    if (prev.GetLinkDensity() <= 0.555556)
    {
        // Rejected only when the current, next and previous blocks are all short.
        rejected = curr.GetNumWords() <= 16
            && next.GetNumWords() <= 15
            && prev.GetNumWords() <= 4;
    }
    else
    {
        // Link-heavy predecessor: the current and next blocks must clear
        // higher word-count limits.
        rejected = curr.GetNumWords() <= 40
            && next.GetNumWords() <= 17;
    }

    return curr.SetIsContent(!rejected);
}
/// <summary>
/// Decides whether <paramref name="curr"/> is content from the word counts and
/// link densities of the block and its two neighbours, and stores the decision
/// on <paramref name="curr"/>.
/// </summary>
/// <param name="prev">Block preceding <paramref name="curr"/>.</param>
/// <param name="curr">Block being classified; its content flag is updated.</param>
/// <param name="next">Block following <paramref name="curr"/>.</param>
/// <returns>The value returned by <c>curr.SetIsContent(...)</c>.</returns>
protected internal bool Classify(TextBlock prev, TextBlock curr, TextBlock next)
{
    // Rule 1: the block contains links and is followed by a block of more than 11 words.
    if (curr.GetLinkDensity() > 0 && next.GetNumWords() > 11)
    {
        return curr.SetIsContent(true);
    }

    // Rule 2: the block has more than 19 words.
    if (curr.GetNumWords() > 19)
    {
        return curr.SetIsContent(true);
    }

    // Rule 3 precondition: the next block has more than 6 words and both
    // neighbours have a link density of exactly zero.
    bool neighboursQualify = next.GetNumWords() > 6
        && next.GetLinkDensity() == 0
        && prev.GetLinkDensity() == 0;
    if (!neighboursQualify)
    {
        return curr.SetIsContent(false);
    }

    // Rule 3: at least one of the three blocks exceeds its word-count limit.
    bool enoughWords = curr.GetNumWords() > 6
        || prev.GetNumWords() > 7
        || next.GetNumWords() > 19;
    return curr.SetIsContent(enoughWords);
}