Esempio n. 1
0
        /// <summary>
        /// Tokenizes the selected text blocks of a document and adds one "Token" annotation per token.
        /// </summary>
        /// <param name="document">
        /// Document to annotate. Nothing is done unless its "contentType" feature equals "Text".
        /// </param>
        /// <remarks>
        /// Any exception raised while tokenizing or annotating is logged through mLogger and not rethrown.
        /// </remarks>
        public /*protected*/ override void ProcessDocument(Document document)
        {
            string docContentType = document.Features.GetFeatureValue("contentType");
            bool isTextDocument = docContentType == "Text";

            if (isTextDocument)
            {
                try
                {
                    foreach (TextBlock block in document.GetAnnotatedBlocks(mBlockSelector))
                    {
                        mTokenizer.Text = block.Text;
                        RegexTokenizer.Enumerator tokenEnum = (RegexTokenizer.Enumerator)mTokenizer.GetEnumerator();
                        while (tokenEnum.MoveNext())
                        {
                            // Token positions are relative to the block text; shift them by the block start.
                            int spanStart = block.SpanStart + tokenEnum.CurrentTokenIdx;
                            // The span end is inclusive, hence the -1.
                            int spanEnd = spanStart + tokenEnum.Current.Length - 1;
                            document.AddAnnotation(new Annotation(spanStart, spanEnd, "Token"));
                        }
                    }
                }
                catch (Exception exception)
                {
                    mLogger.Error("ProcessDocument", exception);
                }
            }
        }
Esempio n. 2
0
 /// <summary>
 /// Tokenizes every block annotated with SRC_ANNOT_TYPE and adds one DEST_ANNOT_TYPE annotation per token.
 /// </summary>
 /// <param name="document">Document whose annotated blocks are tokenized; annotations are added to it in place.</param>
 protected override void ProcessDocument(Document document)
 {
     foreach (TextBlock block in document.GetAnnotatedBlocks(SRC_ANNOT_TYPE))
     {
         // Feed the block text to the shared tokenizer, then walk the resulting tokens.
         mTokenizer.Text = block.Text;
         RegexTokenizer.Enumerator tokenEnum = (RegexTokenizer.Enumerator)mTokenizer.GetEnumerator();
         while (tokenEnum.MoveNext())
         {
             // Token positions are relative to the block text; shift them by the block start.
             int spanStart = block.SpanStart + tokenEnum.CurrentTokenIdx;
             // The span end is inclusive, hence the -1.
             int spanEnd = spanStart + tokenEnum.Current.Length - 1;
             document.AddAnnotation(new Annotation(spanStart, spanEnd, DEST_ANNOT_TYPE));
         }
     }
 }
Esempio n. 3
0
        /// <summary>
        /// Converts one HTML node into the token(s) that represent it in the source text (mText).
        /// </summary>
        /// <param name="node">Node to convert. Only element and text nodes produce tokens.</param>
        /// <param name="endTag">
        /// Receives the closing-tag token when <paramref name="node"/> is an element with inner content
        /// (case 4 below); null in every other case.
        /// </param>
        /// <param name="textBlockTokenizer">
        /// Tokenizer used to split a text node into several tokens. When null, a text node yields a single
        /// Text token.
        /// </param>
        /// <returns>
        /// The tokens created for the node, or null when the node is neither an element nor a text node,
        /// or when tokenizing a text node produced no tokens.
        /// </returns>
        private IEnumerable <Token> CreateToken(HtmlNode node, out Token endTag, RegexTokenizer textBlockTokenizer)
        {
            IEnumerable <Token> tokens = null;

            endTag = null;
            if (node.NodeType == HtmlNodeType.Element)
            {
                // Tag names are identifiers, so lower-case them with the invariant culture. The
                // culture-sensitive ToLower() maps 'I' to a dotless i under e.g. the Turkish culture,
                // which would turn "TITLE" or "DIV" into names that no longer match their ASCII form.
                string tagName = node.Name.ToLowerInvariant();

                // case 1: open tag like <i> without </i> (inside another tag like <b><i></b>)
                if (node._innerlength <= 0 && node._outerlength <= 0)
                {
                    // No outer length is available here, so the tag extent is derived from the inner start.
                    Token token = CreateTagToken(TokenType.OpenTag, node._outerstartindex, node._innerstartindex - node._outerstartindex, tagName);
                    tokens = new Token[] { token };
                }
                // case 2: open tag like <i> without </i> (other cases)
                else if (node._innerlength <= 0 && node.EndNode == null)
                {
                    Token token = CreateTagToken(TokenType.OpenTag, node._outerstartindex, node._outerlength, tagName);
                    tokens = new Token[] { token };
                }
                // case 3: empty tag like <br> or <br/> (EndNode is non-null here because case 2 did not match)
                else if (node._innerlength <= 0)
                {
                    if (node.EndNode._outerstartindex != node._outerstartindex) // handle <tag></tag> pair
                    {
                        // The start tag spans everything up to the point where the end tag begins.
                        Token firstTag = CreateTagToken(TokenType.StartTag, node._outerstartindex, node.EndNode._outerstartindex - node._outerstartindex, tagName);
                        Token secondTag = CreateTagToken(TokenType.EndTag, node.EndNode._outerstartindex, node.EndNode._outerlength, tagName);
                        tokens = new Token[] { firstTag, secondTag };
                    }
                    else // handle <tag/>
                    {
                        Token token = CreateTagToken(TokenType.EmptyTag, node._outerstartindex, node._outerlength, tagName);
                        tokens = new Token[] { token };
                    }
                }
                // case 4: closed tag like <b>some text</b>
                else
                {
                    Token token = CreateTagToken(TokenType.StartTag, node._outerstartindex, node._innerstartindex - node._outerstartindex, tagName);
                    tokens = new Token[] { token };
                    // The end tag occupies whatever follows the inner content up to the end of the outer span.
                    // It is handed back separately through the out parameter instead of being added to tokens.
                    int endTagStart = node._innerstartindex + node._innerlength;
                    int endTagLength = node._outerstartindex + node._outerlength - endTagStart;
                    endTag = CreateTagToken(TokenType.EndTag, endTagStart, endTagLength, tagName);
                }
            }
            else if (node.NodeType == HtmlNodeType.Text)
            {
                if (textBlockTokenizer == null)
                {
                    // Whole text node as one token. Start index and length always describe the raw span
                    // in mText, even when the token string itself is HTML-decoded.
                    Token token = new Token();
                    token.mTokenType  = TokenType.Text;
                    token.mStartIndex = node._innerstartindex;
                    token.mLength     = node._innerlength;
                    token.mTokenStr   = mText.Substring(node._innerstartindex, node._innerlength);
                    if (mDecodeTextBlocks)
                    {
                        token.mTokenStr = HttpUtility.HtmlDecode(token.mTokenStr);
                    }
                    tokens = new Token[] { token };
                }
                else // tokenize text block
                {
                    ArrayList <Token> tokenList = new ArrayList <Token>();
                    string text = mText.Substring(node._innerstartindex, node._innerlength);
                    textBlockTokenizer.Text = mDecodeTextBlocks ? HttpUtility.HtmlDecode(text) : text;
                    RegexTokenizer.Enumerator tokEnum = (RegexTokenizer.Enumerator)textBlockTokenizer.GetEnumerator();
                    int baseIdx = node._innerstartindex;
                    while (tokEnum.MoveNext())
                    {
                        string tokenStr = tokEnum.Current;
                        Token  token    = new Token();
                        token.mTokenType = GetTokenType(tokenStr);
                        // Source positions are only recorded for undecoded text; with decoding enabled
                        // they keep their default values (presumably because positions in the decoded
                        // string no longer correspond to positions in mText).
                        if (!mDecodeTextBlocks)
                        {
                            token.mStartIndex = baseIdx + tokEnum.CurrentTokenIdx;
                            token.mLength     = tokenStr.Length;
                        }
                        token.mTokenStr = tokenStr;
                        tokenList.Add(token);
                    }
                    // Preserve the existing contract: an empty tokenization is reported as null.
                    if (tokenList.Count > 0)
                    {
                        tokens = tokenList;
                    }
                }
            }
            return(tokens);
        }

        /// <summary>
        /// Builds a tag token of the given type whose string is the <paramref name="length"/> characters
        /// of mText starting at <paramref name="startIndex"/>.
        /// </summary>
        /// <param name="tokenType">Token type to assign (OpenTag, StartTag, EndTag or EmptyTag).</param>
        /// <param name="startIndex">Start of the tag in mText.</param>
        /// <param name="length">Number of characters the tag occupies in mText.</param>
        /// <param name="tagName">Already lower-cased tag name to store on the token.</param>
        /// <returns>The newly created token.</returns>
        private Token CreateTagToken(TokenType tokenType, int startIndex, int length, string tagName)
        {
            Token token = new Token();
            token.mTokenType  = tokenType;
            token.mStartIndex = startIndex;
            token.mLength     = length;
            token.mTokenStr   = mText.Substring(startIndex, length);
            token.mTagName    = tagName;
            return(token);
        }