Example #1
0
        /// <summary>
        /// Tokenizes the buffered text and wraps each token in a Word.
        /// Each Word carries the token text plus its start and end character
        /// offsets, as reported by TokenizeToWordTokens().
        /// </summary>
        /// <returns>Array of Word labels, one per token, in input order.</returns>
        public virtual Word[] TokenizeToWords()
        {
            IList <RobustTokenizer.WordToken> toks = TokenizeToWordTokens();

            Word[] labels = new Word[toks.Count];
            for (int i = 0; i < toks.Count; i++)
            {
                RobustTokenizer.WordToken tok = toks[i];
                // Word(text, startOffset, endOffset)
                labels[i] = new Word(tok.GetWord(), tok.GetStart(), tok.GetEnd());
            }
            return labels;
        }
Example #2
0
        /// <summary>Tokenizes a natural language string.</summary>
        /// <remarks>
        /// Two passes over the buffered text:
        /// 1. Straight regex tokenization via <c>wordPattern</c>, splitting off
        ///    "n't" suffixes and apostrophe blocks into separate tokens.
        /// 2. Merging of consecutive tokens that concatenate to a known
        ///    abbreviation (longest match first, same line only).
        /// The merged list is then run through Postprocess before returning.
        /// </remarks>
        /// <returns>List of WordTokens</returns>
        public virtual IList <RobustTokenizer.WordToken> TokenizeToWordTokens()
        {
            IList <RobustTokenizer.WordToken> result = new List <RobustTokenizer.WordToken>();

            //
            // Straight tokenization, ignoring known abbreviations
            //
            Matcher match            = wordPattern.Matcher(buffer);
            int     previousEndMatch = 0;

            while (match.Find())
            {
                string crtMatch   = match.Group();
                int    endMatch   = match.End();
                int    startMatch = endMatch - crtMatch.Length;
                int    i;
                if (crtMatch.EndsWith("n't"))
                {
                    // Split a word ending in "n't" into stem + "n't" (e.g. "don't" -> "do" + "n't").
                    // The stem token carries the new-line count since the previous match;
                    // the "n't" token is adjacent, so its count is 0.
                    if (crtMatch.Length > 3)
                    {
                        RobustTokenizer.WordToken token1 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, 0, crtMatch.Length - 3), startMatch, endMatch - 3, CountNewLines(buffer, previousEndMatch, startMatch));
                        result.Add(token1);
                    }
                    RobustTokenizer.WordToken token2 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, crtMatch.Length - 3, crtMatch.Length), endMatch - 3, endMatch, 0);
                    result.Add(token2);
                }
                else
                {
                    // found word containing an apostrophe block: split before the block
                    // XXX: is this too relaxed? e.g. "O'Hare"
                    if ((i = HasApostropheBlock(crtMatch)) != -1)
                    {
                        RobustTokenizer.WordToken token1 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, 0, i), startMatch, startMatch + i, CountNewLines(buffer, previousEndMatch, startMatch));
                        RobustTokenizer.WordToken token2 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, i, crtMatch.Length), startMatch + i, endMatch, 0);
                        result.Add(token1);
                        result.Add(token2);
                    }
                    else
                    {
                        // just a regular word
                        RobustTokenizer.WordToken token = new RobustTokenizer.WordToken(crtMatch, startMatch, endMatch, CountNewLines(buffer, previousEndMatch, startMatch));
                        result.Add(token);
                    }
                }
                previousEndMatch = endMatch;
            }

            //
            // Merge known abbreviations: runs of adjacent tokens whose
            // concatenation is in mAbbreviations become a single token.
            //
            IList <RobustTokenizer.WordToken> resultWithAbs = new List <RobustTokenizer.WordToken>();

            for (int i_1 = 0; i_1 < result.Count; i_1++)
            {
                // where the candidate multiword ends (exclusive), capped at MaxMultiWordSize tokens
                int end = result.Count;
                if (end > i_1 + MaxMultiWordSize)
                {
                    end = i_1 + MaxMultiWordSize;
                }
                bool found = false;
                // try the longest candidate first; must have at least two tokens per multiword
                for (; end > i_1 + 1; end--)
                {
                    RobustTokenizer.WordToken startToken = result[i_1];
                    RobustTokenizer.WordToken endToken   = result[end - 1];
                    if (CountNewLines(result, i_1, end) == 0)
                    {
                        // abbreviation tokens cannot appear on different lines
                        string conc = Concatenate(result, i_1, end);
                        // found a multiword: emit it as one token and skip its parts
                        if (mAbbreviations.Contains(conc))
                        {
                            found = true;
                            RobustTokenizer.WordToken token = new RobustTokenizer.WordToken(conc, startToken.GetStart(), endToken.GetEnd(), startToken.GetNewLineCount());
                            resultWithAbs.Add(token);
                            i_1 = end - 1;
                            break;
                        }
                    }
                }
                // no multiword starting at this position found; keep the token as-is
                if (!found)
                {
                    resultWithAbs.Add(result[i_1]);
                }
            }
            return Postprocess(resultWithAbs);
        }
 /// <summary>
 /// Converts a WordToken to an AceToken, copying the word text and the
 /// start/end offsets (as strings); the remaining string fields are empty.
 /// </summary>
 /// <param name="wordToken">Token to convert.</param>
 /// <param name="sentence">Index of the sentence containing this token.</param>
 /// <returns>The equivalent AceToken.</returns>
 public static AceToken WordTokenToAceToken(RobustTokenizer.WordToken wordToken, int sentence)
 {
     // Fix: int.ToString(value) is not valid C# — Int32.ToString has no static
     // overload; call ToString() on the value itself instead.
     return new AceToken(wordToken.GetWord(), string.Empty, string.Empty, string.Empty, string.Empty, wordToken.GetStart().ToString(), wordToken.GetEnd().ToString(), sentence);
 }