Example #1
        /// <summary>
        /// Smart tokenization, storing the output in an array of Word objects.
        /// Each Word stores:
        /// - the text of the token
        /// - the character offset where the token starts
        /// - the character offset where the token ends
        /// </summary>
        public virtual Word[] TokenizeToWords()
        {
            IList <RobustTokenizer.WordToken> toks = TokenizeToWordTokens();

            Word[] labels = new Word[toks.Count];
            for (int i = 0; i < toks.Count; i++)
            {
                RobustTokenizer.WordToken tok = toks[i];
                Word l = new Word(tok.GetWord(), tok.GetStart(), tok.GetEnd());
                labels[i] = l;
            }
            return(labels);
        }
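A minimal usage sketch: the constructor taking the raw input string is borrowed from TokenizeAndSegmentSentences below, and the sample text is illustrative only.

        RobustTokenizer<Word> tokenizer = new RobustTokenizer<Word>("Mr. Smith isn't here.");
        Word[] words = tokenizer.TokenizeToWords();
        // one Word per token, carrying its text and character offsets
        System.Console.WriteLine(words.Length + " tokens");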
Example #2
        /// <summary>Tokenizes a natural language string</summary>
        /// <returns>List of WordTokens</returns>
        public virtual IList <RobustTokenizer.WordToken> TokenizeToWordTokens()
        {
            IList <RobustTokenizer.WordToken> result = new List <RobustTokenizer.WordToken>();
            //
            // replace illegal characters with SPACE
            //

            /*
             * StringBuffer buffer = new StringBuffer();
             * for(int i = 0; i < originalString.length(); i ++){
             * int c = (int) originalString.charAt(i);
             * //
             * // regular character
             * //
             * if(c > 31 && c < 127) buffer.append((char) c);
             *
             * else{
             * log.info("Control character at position " + i + ": " + c);
             *
             * //
             * // DOS new line counts as two characters
             * //
             * if(c == 10) buffer.append(" ");
             *
             * //
             * // other control character
             * //
             * else buffer.append(' ');
             * }
             * }
             */
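            // NOTE: the character sanitization above is disabled; 'buffer' below
            // is presumably an instance field holding the input text, set by the
            // constructor.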
            Matcher match            = wordPattern.Matcher(buffer);
            int     previousEndMatch = 0;

            //
            // Straight tokenization, ignoring known abbreviations
            //
            while (match.Find())
            {
                string crtMatch   = match.Group();
                int    endMatch   = match.End();
                int    startMatch = endMatch - crtMatch.Length;
                int    i;
                // found word ending in "n't"
                if (crtMatch.EndsWith("n't"))
                {
                    if (crtMatch.Length > 3)
                    {
                        RobustTokenizer.WordToken token1 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, 0, crtMatch.Length - 3), startMatch, endMatch - 3, CountNewLines(buffer, previousEndMatch, startMatch));
                        result.Add(token1);
                    }
                    RobustTokenizer.WordToken token2 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, crtMatch.Length - 3, crtMatch.Length), endMatch - 3, endMatch, 0);
                    result.Add(token2);
                }
                else
                {
                    // found word containing an apostrophe
                    // XXX: is this too relaxed? e.g. "O'Hare"
                    if ((i = HasApostropheBlock(crtMatch)) != -1)
                    {
                        RobustTokenizer.WordToken token1 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, 0, i), startMatch, startMatch + i, CountNewLines(buffer, previousEndMatch, startMatch));
                        RobustTokenizer.WordToken token2 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, i, crtMatch.Length), startMatch + i, endMatch, 0);
                        result.Add(token1);
                        result.Add(token2);
                    }
                    else
                    {
                        // just a regular word
                        RobustTokenizer.WordToken token = new RobustTokenizer.WordToken(crtMatch, startMatch, endMatch, CountNewLines(buffer, previousEndMatch, startMatch));
                        result.Add(token);
                    }
                }
                previousEndMatch = endMatch;
            }
            //
            // Merge known abbreviations
            //
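            // Greedy longest match: for each start position, try the longest
            // candidate span first (up to MaxMultiWordSize tokens) and shrink it
            // until a known abbreviation is found.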
            IList <RobustTokenizer.WordToken> resultWithAbs = new List <RobustTokenizer.WordToken>();

            for (int i_1 = 0; i_1 < result.Count; i_1++)
            {
                // where the multiword candidate ends (exclusive)
                int end = result.Count;
                if (end > i_1 + MaxMultiWordSize)
                {
                    end = i_1 + MaxMultiWordSize;
                }
                bool found = false;
                // must have at least two tokens per multiword
                for (; end > i_1 + 1; end--)
                {
                    RobustTokenizer.WordToken startToken = result[i_1];
                    RobustTokenizer.WordToken endToken   = result[end - 1];
                    if (CountNewLines(result, i_1, end) == 0)
                    {
                        // abbreviation tokens cannot appear on different lines
                        string conc = Concatenate(result, i_1, end);
                        found = false;
                        // found a multiword
                        if (mAbbreviations.Contains(conc))
                        {
                            found = true;
                            RobustTokenizer.WordToken token = new RobustTokenizer.WordToken(conc, startToken.GetStart(), endToken.GetEnd(), startToken.GetNewLineCount());
                            resultWithAbs.Add(token);
                            i_1 = end - 1;
                            break;
                        }
                    }
                }
                // no multiword starting at this position found
                if (!found)
                {
                    resultWithAbs.Add(result[i_1]);
                }
            }
            resultWithAbs = Postprocess(resultWithAbs);
            return(resultWithAbs);
        }
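A hedged sketch of the behavior implemented above, with an illustrative input: a trailing "n't" is split off as its own token, and spans that match entries of mAbbreviations are re-merged into single tokens.

        RobustTokenizer<Word> tokenizer = new RobustTokenizer<Word>("She isn't here.");
        IList<RobustTokenizer.WordToken> toks = tokenizer.TokenizeToWordTokens();
        foreach (RobustTokenizer.WordToken t in toks)
        {
            // expected output (depending on wordPattern): "She", "is", "n't", "here", "."
            System.Console.WriteLine(t.GetWord() + " [" + t.GetStart() + ", " + t.GetEnd() + ")");
        }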
        public static AceToken WordTokenToAceToken(RobustTokenizer.WordToken wordToken, int sentence)
        {
            return(new AceToken(wordToken.GetWord(), string.Empty, string.Empty, string.Empty, string.Empty, wordToken.GetStart().ToString(), wordToken.GetEnd().ToString(), sentence));
        }
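WordTokenToAceToken copies only the word, its character offsets (as strings), and the sentence index; the remaining AceToken string fields are left empty here, presumably to be filled in later by the ACE pipeline.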
        /// <param name="filenamePrefix">path to an ACE .sgm file (but not including the .sgm extension)</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Xml.Sax.SAXException"/>
        /// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
        public static IList <IList <AceToken> > TokenizeAndSegmentSentences(string filenamePrefix)
        {
            IList <IList <AceToken> > sentences = new List <IList <AceToken> >();
            File   inputFile = new File(filenamePrefix + AceDocument.OrigExt);
            string input     = IOUtils.SlurpFile(inputFile);
            // now we can split the text into tokens
            RobustTokenizer <Word>            tokenizer = new RobustTokenizer <Word>(input);
            IList <RobustTokenizer.WordToken> tokenList = tokenizer.TokenizeToWordTokens();
            // and group the tokens into sentences
            List <AceToken> currentSentence = new List <AceToken>();
            int             quoteCount      = 0;

            for (int i = 0; i < tokenList.Count; i++)
            {
                RobustTokenizer.WordToken token = tokenList[i];
                string   tokenText      = token.GetWord();
                AceToken convertedToken = WordTokenToAceToken(token, sentences.Count);
                // start a new sentence when we hit some SGML; the original check
                // for 2+ skipped lines (after datelines, etc.) is disabled:
                // if (token.getNewLineCount() > 1 || AceToken.isSgml(tokenText)) {
                if (AceToken.IsSgml(tokenText))
                {
                    if (currentSentence.Count > 0)
                    {
                        sentences.Add(currentSentence);
                    }
                    currentSentence = new List <AceToken>();
                    quoteCount      = 0;
                }
                currentSentence.Add(convertedToken);
                if (tokenText.Equals("\""))
                {
                    quoteCount++;
                }
                // start a new sentence whenever we hit sentence-final punctuation
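                // (sentenceFinalPuncSet presumably contains tokens such as ".", "!", and "?")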
                if (sentenceFinalPuncSet.Contains(tokenText))
                {
                    // include quotes after EOS
                    if (i < tokenList.Count - 1 && quoteCount % 2 == 1 && tokenList[i + 1].GetWord().Equals("\""))
                    {
                        AceToken quoteToken = WordTokenToAceToken(tokenList[i + 1], sentences.Count);
                        currentSentence.Add(quoteToken);
                        quoteCount++;
                        i++;
                    }
                    if (currentSentence.Count > 0)
                    {
                        sentences.Add(currentSentence);
                    }
                    currentSentence = new List <AceToken>();
                    quoteCount      = 0;
                }
                else
                {
                    // an SGML tag also ends the sentence that contains it
                    if (AceToken.IsSgml(tokenText))
                    {
                        if (currentSentence.Count > 0)
                        {
                            sentences.Add(currentSentence);
                        }
                        currentSentence = new List <AceToken>();
                        quoteCount      = 0;
                    }
                }
            }
            return(sentences);
        }
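A usage sketch for the full pipeline; the file prefix is hypothetical, and the ".sgm" extension is appended internally via AceDocument.OrigExt per the <param> note above.

        // hypothetical prefix: the corresponding file would be /data/ace/APW20001001.sgm
        IList<IList<AceToken>> sentences = TokenizeAndSegmentSentences("/data/ace/APW20001001");
        System.Console.WriteLine(sentences.Count + " sentences");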