Пример #1
0
        /// <summary>
        /// Converts a document whose "contentType" feature is "Html" into plain text, in place.
        /// Non-empty text tokens are collected into blocks; consecutive text tokens are joined
        /// with a single space into one block until a tag listed in <c>mTagKeepList</c> is seen.
        /// Each block becomes one line of the new <c>document.Text</c> and is covered by a
        /// "TextBlock" annotation. On success the "contentType" feature is set to "Text".
        /// </summary>
        /// <param name="document">Document to convert; left untouched when its content type is not "Html".</param>
        /// <remarks>
        /// Any exception is logged through <c>mLogger</c> and not rethrown. Annotations added before
        /// the failure are not rolled back, while the text and content type stay unchanged.
        /// </remarks>
        public /*protected*/ override void ProcessDocument(Document document)
        {
            string contentType = document.Features.GetFeatureValue("contentType");

            if (contentType != "Html")
            {
                return;
            }
            try
            {
                HtmlTokenizer htmlTokenizer = new HtmlTokenizer(document.Text, /*stemmer=*/ null, /*decode=*/ true, /*tokenize=*/ false, /*applySkipRules=*/ true);
                // StringBuilder.AppendLine terminates every block with Environment.NewLine, so the
                // annotation offsets must advance by that length rather than by a hard-coded 2.
                int newLineLen = Environment.NewLine.Length;
                int idx = 0; // offset in the output text at which the next block starts
                ArrayList <string> txtBlocks = new ArrayList <string>();
                bool merge = false; // true while the next text fragment belongs to the last block
                for (HtmlTokenizer.Enumerator e = (HtmlTokenizer.Enumerator)htmlTokenizer.GetEnumerator(); e.MoveNext();)
                {
                    if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.Text)
                    {
                        string textBlock = Utils.ToOneLine(e.Current.Trim(), /*compact=*/ true);
                        if (textBlock != "")
                        {
                            if (!merge)
                            {
                                txtBlocks.Add(textBlock);
                                document.AddAnnotation(new Annotation(idx, idx + textBlock.Length - 1, "TextBlock"));
                            }
                            else
                            {
                                // The fragment is appended to the previous block after one space:
                                // undo the line terminator counted for that block and count the space.
                                idx -= newLineLen - 1;
                                txtBlocks.Last += " " + textBlock;
                                // Replace the previous annotation with one that also spans the new fragment.
                                int oldStartIdx = document.GetAnnotationAt(document.AnnotationCount - 1).SpanStart;
                                document.RemoveAnnotationAt(document.AnnotationCount - 1);
                                document.AddAnnotation(new Annotation(oldStartIdx, idx + textBlock.Length - 1, "TextBlock"));
                            }
                            idx  += textBlock.Length + newLineLen;
                            merge = true;
                        }
                    }
                    else
                    {
                        // Invariant lower-casing keeps tag matching independent of the current
                        // culture (e.g. "TITLE" would become "tıtle" under a Turkish culture).
                        // NOTE(review): assumes TagName is non-null for every non-text token - confirm.
                        if (mTagKeepList.Contains(e.CurrentToken.TagName.ToLowerInvariant()))
                        {
                            merge = false;
                        }
                    }
                }
                StringBuilder sb = new StringBuilder();
                foreach (string textBlock in txtBlocks)
                {
                    sb.AppendLine(textBlock);
                }
                document.Text = sb.ToString();
                document.Features.SetFeatureValue("contentType", "Text");
            }
            catch (Exception exception)
            {
                mLogger.Error("ProcessDocument", exception);
            }
        }
Пример #2
0
        /// <summary>
        /// Converts a document whose "contentType" feature is "Html" into plain text, in place.
        /// Non-empty text tokens are collected into blocks; consecutive text tokens are joined
        /// with a single space into one block until a tag listed in <c>mSplitTags</c> is seen.
        /// Each block becomes one line of the new <c>document.Text</c> and is covered by a
        /// "TextBlock" annotation carrying two features: "domPath" (the open tags from outermost
        /// to innermost, joined by "/"; for merged blocks the shortest path seen) and
        /// "linkToTextRatio" ("linkChars/textChars", where link characters are those found while
        /// an "a" tag is open). On success the "contentType" feature is set to "Text".
        /// </summary>
        /// <param name="document">Document to convert; left untouched when its content type is not "Html".</param>
        /// <remarks>
        /// Any exception is logged through <c>mLogger</c> and not rethrown. Annotations added before
        /// the failure are not rolled back, while the text and content type stay unchanged.
        /// </remarks>
        public /*protected*/ override void ProcessDocument(Document document)
        {
            string contentType = document.Features.GetFeatureValue("contentType");

            if (contentType != "Html")
            {
                return;
            }
            try
            {
                HtmlTokenizer htmlTokenizer = new HtmlTokenizer(document.Text, /*stemmer=*/ null, /*decode=*/ true, /*tokenize=*/ false, /*applySkipRules=*/ true);
                // StringBuilder.AppendLine terminates every block with Environment.NewLine, so the
                // annotation offsets must advance by that length rather than by a hard-coded 2.
                int newLineLen = Environment.NewLine.Length;
                int idx = 0; // offset in the output text at which the next block starts
                ArrayList <string> txtBlocks = new ArrayList <string>();
                bool merge = false; // true while the next text fragment belongs to the last block
                Stack <string> tags = new Stack <string>(); // currently open tags, innermost on top
                for (HtmlTokenizer.Enumerator e = (HtmlTokenizer.Enumerator)htmlTokenizer.GetEnumerator(); e.MoveNext();)
                {
                    if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.Text)
                    {
                        string textBlock = Utils.ToOneLine(e.Current.Trim(), /*compact=*/ true);
                        if (textBlock != "")
                        {
                            // Aggregate without a seed throws on an empty sequence, which would abort
                            // the whole conversion for text that sits outside of any open tag.
                            string domPath = tags.Count == 0 ? "" : tags.Aggregate((x, y) => y + "/" + x);
                            bool   isLink  = tags.Contains("a");
                            if (!merge)
                            {
                                txtBlocks.Add(textBlock);
                                document.AddAnnotation(new Annotation(idx, idx + textBlock.Length - 1, "TextBlock"));
                                document.Annotations.Last.Features.SetFeatureValue("domPath", domPath);
                                document.Annotations.Last.Features.SetFeatureValue("linkToTextRatio", string.Format("{0}/{1}", isLink ? textBlock.Length : 0, textBlock.Length));
                            }
                            else
                            {
                                // The fragment is appended to the previous block after one space:
                                // undo the line terminator counted for that block and count the space.
                                idx -= newLineLen - 1;
                                txtBlocks.Last += " " + textBlock;
                                // Read everything needed from the previous annotation before replacing it.
                                int    oldStartIdx        = document.GetAnnotationAt(document.AnnotationCount - 1).SpanStart;
                                string oldDomPath         = document.Annotations.Last.Features.GetFeatureValue("domPath");
                                string oldLinkToTextRatio = document.Annotations.Last.Features.GetFeatureValue("linkToTextRatio");
                                document.RemoveAnnotationAt(document.AnnotationCount - 1);
                                document.AddAnnotation(new Annotation(oldStartIdx, idx + textBlock.Length - 1, "TextBlock"));
                                // Keep the shorter of the two paths for the merged block.
                                document.Annotations.Last.Features.SetFeatureValue("domPath", domPath.Length < oldDomPath.Length ? domPath : oldDomPath);
                                // Accumulate the "link/text" character counts of the merged block.
                                string[] ratioParts = oldLinkToTextRatio.Split('/');
                                int linkCharCount = Convert.ToInt32(ratioParts[0]) + (isLink ? textBlock.Length : 0);
                                int textCharCount = Convert.ToInt32(ratioParts[1]) + textBlock.Length;
                                document.Annotations.Last.Features.SetFeatureValue("linkToTextRatio", string.Format("{0}/{1}", linkCharCount, textCharCount));
                            }
                            idx  += textBlock.Length + newLineLen;
                            merge = true;
                        }
                    }
                    else
                    {
                        // Invariant lower-casing keeps tag matching independent of the current
                        // culture (e.g. "TITLE" would become "tıtle" under a Turkish culture).
                        // NOTE(review): assumes TagName is non-null for every non-text token - confirm.
                        string tagName = e.CurrentToken.TagName.ToLowerInvariant();
                        if (mSplitTags.Contains(tagName))
                        {
                            merge = false;
                        }
                        if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.StartTag)
                        {
                            tags.Push(tagName);
                        }
                        else if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.EndTag)
                        {
                            string openTagName = tags.Count > 0 ? tags.Pop() : null;
                            if (openTagName != tagName)
                            {
                                mLogger.Error("ProcessDocument", "End tag does not match start tag (found {0} instead of {1}).", openTagName ?? "nothing", tagName);
                                // Ignore the stray end tag by restoring the popped start tag. Nothing
                                // was popped from an empty stack, so never push a null entry there.
                                if (openTagName != null)
                                {
                                    tags.Push(openTagName);
                                }
                            }
                        }
                    }
                }
                StringBuilder sb = new StringBuilder();
                foreach (string textBlock in txtBlocks)
                {
                    sb.AppendLine(textBlock);
                }
                document.Text = sb.ToString();
                document.Features.SetFeatureValue("contentType", "Text");
            }
            catch (Exception exception)
            {
                mLogger.Error("ProcessDocument", exception);
            }
        }