/// <summary>
/// Converts an HTML document to plain text in place: the text tokens are collected into
/// text blocks (one block per output line) and each block is covered by a "TextBlock"
/// annotation.
/// </summary>
/// <remarks>
/// Documents whose "contentType" feature is not "Html" are left untouched. On success the
/// document text is replaced by the collected blocks and "contentType" is set to "Text".
/// Any exception is logged through <c>mLogger</c> and swallowed; in that case annotations
/// added before the failure stay on the document while its text is not replaced.
/// </remarks>
/// <param name="document">The document to convert.</param>
public /*protected*/ override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (contentType != "Html")
    {
        return;
    }
    try
    {
        HtmlTokenizer htmlTokenizer = new HtmlTokenizer(document.Text, /*stemmer=*/null, /*decode=*/true, /*tokenize=*/false, /*applySkipRules=*/true);
        // The output is assembled with StringBuilder.AppendLine, which terminates each block
        // with Environment.NewLine. Annotation offsets must advance by that same length; a
        // hard-coded 2 is only correct where the newline is "\r\n".
        int newLineLength = Environment.NewLine.Length;
        int idx = 0; // offset in the output text at which the next new block starts
        ArrayList<string> txtBlocks = new ArrayList<string>();
        bool merge = false; // true while the next text token continues the current block
        for (HtmlTokenizer.Enumerator e = (HtmlTokenizer.Enumerator)htmlTokenizer.GetEnumerator(); e.MoveNext();)
        {
            if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.Text)
            {
                string textBlock = Utils.ToOneLine(e.Current.Trim(), /*compact=*/true);
                if (textBlock != "")
                {
                    if (!merge)
                    {
                        txtBlocks.Add(textBlock);
                        document.AddAnnotation(new Annotation(idx, idx + textBlock.Length - 1, "TextBlock"));
                    }
                    else
                    {
                        // The previous block's line terminator is replaced by a single joining
                        // space, so step back over the terminator and forward over the space.
                        idx -= newLineLength - 1;
                        txtBlocks.Last += " " + textBlock;
                        // Re-create the last annotation with the same start and the extended end.
                        int oldStartIdx = document.GetAnnotationAt(document.AnnotationCount - 1).SpanStart;
                        document.RemoveAnnotationAt(document.AnnotationCount - 1);
                        document.AddAnnotation(new Annotation(oldStartIdx, idx + textBlock.Length - 1, "TextBlock"));
                    }
                    idx += textBlock.Length + newLineLength;
                    merge = true;
                }
            }
            else
            {
                // Invariant lower-casing keeps tag matching independent of the current culture
                // (e.g. "I" does not lower-case to "i" under Turkish culture rules).
                // NOTE(review): assumes mTagKeepList holds lower-case tag names - confirm.
                if (mTagKeepList.Contains(e.CurrentToken.TagName.ToLowerInvariant()))
                {
                    // A tag from the keep list ends the current block.
                    merge = false;
                }
            }
        }
        StringBuilder sb = new StringBuilder();
        foreach (string textBlock in txtBlocks)
        {
            sb.AppendLine(textBlock);
        }
        document.Text = sb.ToString();
        document.Features.SetFeatureValue("contentType", "Text");
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
/// <summary>
/// Converts an HTML document to plain text in place: the text tokens are collected into
/// text blocks (one block per output line) and each block is covered by a "TextBlock"
/// annotation carrying two features, "domPath" and "linkToTextRatio".
/// </summary>
/// <remarks>
/// <para>
/// Documents whose "contentType" feature is not "Html" are left untouched. On success the
/// document text is replaced by the collected blocks and "contentType" is set to "Text".
/// Any exception is logged through <c>mLogger</c> and swallowed; in that case annotations
/// added before the failure stay on the document while its text is not replaced.
/// </para>
/// <para>
/// "domPath" is the slash-separated list of currently open tags, outermost first; for a
/// merged block the shorter of the merged paths is kept. "linkToTextRatio" is stored as
/// "linkChars/textChars", where linkChars counts the characters of text found while an
/// "a" tag is open.
/// </para>
/// </remarks>
/// <param name="document">The document to convert.</param>
public /*protected*/ override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (contentType != "Html")
    {
        return;
    }
    try
    {
        HtmlTokenizer htmlTokenizer = new HtmlTokenizer(document.Text, /*stemmer=*/null, /*decode=*/true, /*tokenize=*/false, /*applySkipRules=*/true);
        // The output is assembled with StringBuilder.AppendLine, which terminates each block
        // with Environment.NewLine. Annotation offsets must advance by that same length; a
        // hard-coded 2 is only correct where the newline is "\r\n".
        int newLineLength = Environment.NewLine.Length;
        int idx = 0; // offset in the output text at which the next new block starts
        ArrayList<string> txtBlocks = new ArrayList<string>();
        bool merge = false; // true while the next text token continues the current block
        Stack<string> tags = new Stack<string>(); // currently open tags, innermost on top
        for (HtmlTokenizer.Enumerator e = (HtmlTokenizer.Enumerator)htmlTokenizer.GetEnumerator(); e.MoveNext();)
        {
            if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.Text)
            {
                string textBlock = Utils.ToOneLine(e.Current.Trim(), /*compact=*/true);
                if (textBlock != "")
                {
                    // The stack enumerates innermost-first, so the fold prepends each outer tag
                    // and yields an outermost-first path. Aggregate without a seed throws on an
                    // empty sequence, so text outside any open tag gets an empty path instead.
                    string domPath = tags.Count == 0 ? "" : tags.Aggregate((x, y) => y + "/" + x);
                    bool isLink = tags.Contains("a");
                    if (!merge)
                    {
                        txtBlocks.Add(textBlock);
                        document.AddAnnotation(new Annotation(idx, idx + textBlock.Length - 1, "TextBlock"));
                        document.Annotations.Last.Features.SetFeatureValue("domPath", domPath);
                        document.Annotations.Last.Features.SetFeatureValue("linkToTextRatio", string.Format("{0}/{1}", isLink ? textBlock.Length : 0, textBlock.Length));
                    }
                    else
                    {
                        // The previous block's line terminator is replaced by a single joining
                        // space, so step back over the terminator and forward over the space.
                        idx -= newLineLength - 1;
                        txtBlocks.Last += " " + textBlock;
                        // Read everything needed from the last annotation before replacing it
                        // with one that has the same start and the extended end.
                        int oldStartIdx = document.GetAnnotationAt(document.AnnotationCount - 1).SpanStart;
                        string oldDomPath = document.Annotations.Last.Features.GetFeatureValue("domPath");
                        string oldLinkToTextRatio = document.Annotations.Last.Features.GetFeatureValue("linkToTextRatio");
                        document.RemoveAnnotationAt(document.AnnotationCount - 1);
                        document.AddAnnotation(new Annotation(oldStartIdx, idx + textBlock.Length - 1, "TextBlock"));
                        // Keep the shorter of the two paths for the merged block.
                        document.Annotations.Last.Features.SetFeatureValue("domPath", domPath.Length < oldDomPath.Length ? domPath : oldDomPath);
                        // The stored ratio is "linkChars/textChars"; add this fragment's counts.
                        string[] oldCounts = oldLinkToTextRatio.Split('/');
                        int linkCharCount = Convert.ToInt32(oldCounts[0]) + (isLink ? textBlock.Length : 0);
                        int textCharCount = Convert.ToInt32(oldCounts[1]) + textBlock.Length;
                        document.Annotations.Last.Features.SetFeatureValue("linkToTextRatio", string.Format("{0}/{1}", linkCharCount, textCharCount));
                    }
                    idx += textBlock.Length + newLineLength;
                    merge = true;
                }
            }
            else
            {
                // Invariant lower-casing keeps tag matching independent of the current culture
                // (e.g. "I" does not lower-case to "i" under Turkish culture rules).
                // NOTE(review): assumes mSplitTags holds lower-case tag names - confirm.
                string tagName = e.CurrentToken.TagName.ToLowerInvariant();
                if (mSplitTags.Contains(tagName))
                {
                    // A split tag ends the current block.
                    merge = false;
                }
                if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.StartTag)
                {
                    tags.Push(tagName);
                }
                else if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.EndTag)
                {
                    // null means there was no open tag to close.
                    string endTagName = tags.Count == 0 ? null : tags.Pop();
                    if (endTagName != tagName)
                    {
                        mLogger.Error("ProcessDocument", "End tag does not match start tag (found {0} instead of {1}).", endTagName == null ? "nothing" : endTagName, tagName);
                        // Restore the tag that was popped by mistake. Nothing was popped from an
                        // empty stack, so nothing is pushed back (a null entry would corrupt
                        // later paths and pops).
                        if (endTagName != null)
                        {
                            tags.Push(endTagName);
                        }
                    }
                }
            }
        }
        StringBuilder sb = new StringBuilder();
        foreach (string textBlock in txtBlocks)
        {
            sb.AppendLine(textBlock);
        }
        document.Text = sb.ToString();
        document.Features.SetFeatureValue("contentType", "Text");
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}