コード例 #1
0
 /// <summary>Appends a token to the end of this object's token collection.</summary>
 /// <param name="t">the token to append to <c>mTokens</c></param>
 public virtual void AddToken(AceToken t)
 {
     this.mTokens.Add(t);
 }
コード例 #2
0
        /// <summary>
        /// Reads the file named <paramref name="filenamePrefix"/> + <c>AceDocument.OrigExt</c>,
        /// tokenizes its full text, and groups the resulting tokens into sentences.
        /// </summary>
        /// <remarks>
        /// A sentence is closed when a token from <c>sentenceFinalPuncSet</c> is seen; a double
        /// quote immediately following that token is kept in the same sentence when an odd number
        /// of quotes has been seen so far in that sentence. Every SGML token (as judged by
        /// <c>AceToken.IsSgml</c>) closes the sentence before it and is then closed itself, so it
        /// ends up alone in its own sentence. Tokens left over at the end of the input are
        /// returned as a final sentence rather than being discarded.
        /// </remarks>
        /// <param name="filenamePrefix">path to an ACE .sgm file (but not including the .sgm extension)</param>
        /// <returns>the sentences in document order; each sentence is a non-empty list of tokens</returns>
        /// <exception cref="System.IO.IOException">presumably raised when the input file cannot be read (depends on <c>IOUtils.SlurpFile</c>)</exception>
        public static IList <IList <AceToken> > TokenizeAndSegmentSentences(string filenamePrefix)
        {
            IList <IList <AceToken> > sentences = new List <IList <AceToken> >();
            File   inputFile = new File(filenamePrefix + AceDocument.OrigExt);
            string input     = IOUtils.SlurpFile(inputFile);
            // now we can split the text into tokens
            RobustTokenizer <Word>            tokenizer = new RobustTokenizer <Word>(input);
            IList <RobustTokenizer.WordToken> tokenList = tokenizer.TokenizeToWordTokens();
            // and group the tokens into sentences
            List <AceToken> currentSentence = new List <AceToken>();
            int             quoteCount      = 0;

            for (int i = 0; i < tokenList.Count; i++)
            {
                RobustTokenizer.WordToken token = tokenList[i];
                string tokenText = token.GetWord();
                bool   isSgml    = AceToken.IsSgml(tokenText);
                // an SGML tag always terminates whatever sentence was in progress
                if (isSgml)
                {
                    currentSentence = CloseSentence(sentences, currentSentence);
                    quoteCount      = 0;
                }
                // convert only after the possible flush above, so that the sentence index
                // handed to the token is the index of the sentence it is actually stored in
                currentSentence.Add(WordTokenToAceToken(token, sentences.Count));
                if (tokenText.Equals("\""))
                {
                    quoteCount++;
                }
                if (sentenceFinalPuncSet.Contains(tokenText))
                {
                    // include a closing quote that directly follows the end-of-sentence token
                    if (i < tokenList.Count - 1 && quoteCount % 2 == 1 && tokenList[i + 1].GetWord().Equals("\""))
                    {
                        currentSentence.Add(WordTokenToAceToken(tokenList[i + 1], sentences.Count));
                        i++; // the quote has been consumed here, skip it in the main loop
                    }
                    currentSentence = CloseSentence(sentences, currentSentence);
                    quoteCount      = 0;
                }
                else if (isSgml)
                {
                    // the SGML tag itself forms a one-token sentence
                    currentSentence = CloseSentence(sentences, currentSentence);
                    quoteCount      = 0;
                }
            }
            // keep trailing tokens that were not terminated by punctuation or an SGML tag
            CloseSentence(sentences, currentSentence);
            return(sentences);
        }

        /// <summary>
        /// Appends <paramref name="sentence"/> to <paramref name="sentences"/> when it holds at
        /// least one token, and returns a fresh empty list for the next sentence.
        /// </summary>
        private static List <AceToken> CloseSentence(IList <IList <AceToken> > sentences, List <AceToken> sentence)
        {
            if (sentence.Count > 0)
            {
                sentences.Add(sentence);
            }
            return(new List <AceToken>());
        }