// Replaces the current corpus content with the words extracted from the given
// plain text. Words are maximal letter runs, optionally hyphen-joined
// (e.g. "state-of-the-art"); each becomes a TaggedWord with no tag and no lemma.
// Any previously loaded TEI header is discarded.
// Throws ArgumentNullException (via Utils.ThrowException) if text is null.
public void LoadFromText(string text)
{
    Utils.ThrowException(text == null ? new ArgumentNullException("text") : null);
    // reset corpus state before loading
    mTaggedWords.Clear();
    mTeiHeader = null;
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.TokenRegex = @"\p{L}+(-\p{L}+)*"; // letter runs, optionally joined by hyphens
    tokenizer.IgnoreUnknownTokens = false;
    foreach (string wordStr in tokenizer.GetTokens(text))
    {
        // tag and lemma are unknown at load time; a tagger fills them in later
        mTaggedWords.Add(new TaggedWord(wordStr, /*tag=*/null, /*lemma=*/null));
    }
}
// Annotates every selected text block of a plain-text document with "Token"
// annotations, one per token produced by mTokenizer. Documents whose
// "contentType" feature is not "Text" are left untouched. Annotation spans are
// inclusive character ranges relative to the whole document. Processing is
// best-effort: any exception is logged and the document is returned unmodified
// from that point on.
public /*protected*/ override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (contentType != "Text") { return; }
    try
    {
        foreach (TextBlock block in document.GetAnnotatedBlocks(mBlockSelector))
        {
            // the concrete enumerator is needed for CurrentTokenIdx (offset within the block)
            RegexTokenizer.Enumerator tokEnum
                = (RegexTokenizer.Enumerator)mTokenizer.GetTokens(block.Text).GetEnumerator();
            while (tokEnum.MoveNext())
            {
                int startIdx = block.SpanStart + tokEnum.CurrentTokenIdx;
                int endIdx = startIdx + tokEnum.Current.Length - 1; // inclusive end
                document.AddAnnotation(new Annotation(startIdx, endIdx, "Token"));
            }
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
// Converts one HTML parse-tree node into lexer tokens over mText.
// Element nodes yield tag tokens (open / start / end / empty); text nodes
// yield either a single Text token or, when textBlockTokenizer is given,
// one token per word-level unit. Returns null when the node contributes no
// tokens. For a well-formed element with content ("case 4"), only the start
// tag is returned and the matching end tag is handed back through the endTag
// out parameter so the caller can emit it after the node's children.
// NOTE(review): relies on HtmlNode internals (_outerstartindex, _innerstartindex,
// _innerlength, _outerlength) — presumably outer = including tags, inner =
// content only, all offsets into mText; declared elsewhere, confirm there.
private IEnumerable <Token> CreateToken(HtmlNode node, out Token endTag, RegexTokenizer textBlockTokenizer)
{
    IEnumerable <Token> tokens = null;
    endTag = null;
    if (node.NodeType == HtmlNodeType.Element)
    {
        // case 1: open tag like <i> without </i> (inside another tag like <b><i></b>)
        if (node._innerlength <= 0 && node._outerlength <= 0)
        {
            Token token = new Token();
            token.mTokenType = TokenType.OpenTag;
            token.mStartIndex = node._outerstartindex;
            // span runs from the tag's start up to where its content would begin
            token.mLength = node._innerstartindex - node._outerstartindex;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
        }
        // case 2: open tag like <i> without </i> (other cases)
        else if (node._innerlength <= 0 && node.EndNode == null)
        {
            Token token = new Token();
            token.mTokenType = TokenType.OpenTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._outerlength;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
        }
        // case 3: empty tag like <br> or <br/>
        else if (node._innerlength <= 0)
        {
            if (node.EndNode._outerstartindex != node._outerstartindex) // handle <tag></tag> pair
            {
                // start tag text: from the node's start up to where the end tag begins
                string startTagStr = mText.Substring(node._outerstartindex, node.EndNode._outerstartindex - node._outerstartindex);
                Token firstTag = new Token();
                firstTag.mTokenType = TokenType.StartTag;
                firstTag.mStartIndex = node._outerstartindex;
                firstTag.mLength = startTagStr.Length;
                firstTag.mTokenStr = startTagStr;
                firstTag.mTagName = node.Name.ToLower();
                string endTagStr = mText.Substring(node.EndNode._outerstartindex, node.EndNode._outerlength);
                Token secondTag = new Token();
                secondTag.mTokenType = TokenType.EndTag;
                // end tag starts immediately after the start tag (no content between them here)
                secondTag.mStartIndex = firstTag.mStartIndex + firstTag.mLength;
                secondTag.mLength = endTagStr.Length;
                secondTag.mTokenStr = endTagStr;
                secondTag.mTagName = firstTag.mTagName;
                tokens = new Token[] { firstTag, secondTag };
            }
            else // handle <tag/>
            {
                Token token = new Token();
                token.mTokenType = TokenType.EmptyTag;
                token.mStartIndex = node._outerstartindex;
                token.mLength = node._outerlength;
                token.mTokenStr = mText.Substring(node._outerstartindex, node._outerlength);
                token.mTagName = node.Name.ToLower();
                tokens = new Token[] { token };
            }
        }
        // case 4: closed tag like <b>some text</b>
        else
        {
            Token token = new Token();
            token.mTokenType = TokenType.StartTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._innerstartindex - node._outerstartindex;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
            // the matching end tag is returned separately (out param) so the
            // caller can emit the node's children between start and end tag
            endTag = new Token();
            endTag.mTokenType = TokenType.EndTag;
            endTag.mStartIndex = node._innerstartindex + node._innerlength; // first char after the content
            endTag.mLength = node._outerstartindex + node._outerlength - endTag.mStartIndex;
            endTag.mTokenStr = mText.Substring(endTag.mStartIndex, endTag.mLength);
            endTag.mTagName = token.mTagName;
        }
    }
    else if (node.NodeType == HtmlNodeType.Text)
    {
        if (textBlockTokenizer == null)
        {
            // emit the whole text block as a single Text token
            Token token = new Token();
            token.mTokenType = TokenType.Text;
            token.mStartIndex = node._innerstartindex;
            token.mLength = node._innerlength;
            token.mTokenStr = mText.Substring(node._innerstartindex, node._innerlength);
            if (mDecodeTextBlocks) { token.mTokenStr = HttpUtility.HtmlDecode(token.mTokenStr); }
            tokens = new Token[] { token };
        }
        else // tokenize text block
        {
            tokens = new ArrayList <Token>();
            string text = mText.Substring(node._innerstartindex, node._innerlength);
            RegexTokenizer.Enumerator tokEnum = (RegexTokenizer.Enumerator)textBlockTokenizer.GetTokens(mDecodeTextBlocks ?
                HttpUtility.HtmlDecode(text) : text).GetEnumerator();
            int baseIdx = node._innerstartindex;
            while (tokEnum.MoveNext())
            {
                string tokenStr = tokEnum.Current;
                Token token = new Token();
                token.mTokenType = GetTokenType(tokenStr);
                // offsets are only meaningful against the raw text; after
                // HtmlDecode the decoded positions no longer map back into
                // mText, so mStartIndex/mLength are deliberately left unset
                if (!mDecodeTextBlocks)
                {
                    token.mStartIndex = baseIdx + tokEnum.CurrentTokenIdx;
                    token.mLength = tokenStr.Length;
                }
                token.mTokenStr = tokenStr;
                ((ArrayList <Token>)tokens).Add(token);
            }
            if (((ArrayList <Token>)tokens).Count == 0) { tokens = null; } // no tokens -> treat as no contribution
        }
    }
    return(tokens);
}