/// <summary>Adds <paramref name="t"/> to <c>mTokens</c>.</summary>
/// <param name="t">The token to add.</param>
public virtual void AddToken(AceToken t) => mTokens.Add(t);
/// <summary>
/// Reads the file at <paramref name="filenamePrefix"/> + <c>AceDocument.OrigExt</c>, tokenizes its
/// text with <c>RobustTokenizer</c>, and groups the resulting tokens into sentences.
/// </summary>
/// <remarks>
/// A sentence is closed after a token contained in <c>sentenceFinalPuncSet</c>; a double quote
/// that immediately follows is pulled into the same sentence when the sentence so far holds an
/// odd number of double quotes. Every token for which <c>AceToken.IsSgml</c> returns true closes
/// the sentence in progress and is emitted as a one-token sentence of its own. Tokens left over
/// when the input ends form a final sentence instead of being discarded.
/// </remarks>
/// <param name="filenamePrefix">path to an ACE .sgm file (but not including the .sgm extension)</param>
/// <returns>The sentences in document order, each one as the list of its tokens.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
public static IList <IList <AceToken> > TokenizeAndSegmentSentences(string filenamePrefix)
{
    const string DoubleQuote = "\"";

    IList <IList <AceToken> > sentences = new List <IList <AceToken> >();
    File inputFile = new File(filenamePrefix + AceDocument.OrigExt);
    string input = IOUtils.SlurpFile(inputFile);

    // now we can split the text into tokens
    RobustTokenizer <Word> tokenizer = new RobustTokenizer <Word>(input);
    IList <RobustTokenizer.WordToken> tokenList = tokenizer.TokenizeToWordTokens();

    // and group the tokens into sentences
    List <AceToken> currentSentence = new List <AceToken>();
    int quoteCount = 0;
    for (int i = 0; i < tokenList.Count; i++)
    {
        RobustTokenizer.WordToken token = tokenList[i];
        string tokenText = token.GetWord();
        bool isSgml = AceToken.IsSgml(tokenText);

        // An SGML tag closes whatever sentence is in progress before the tag itself is added.
        // (Breaking on 2+ skipped lines via the token's new-line count used to be part of this
        // condition but is disabled; only SGML triggers the break.)
        if (isSgml && currentSentence.Count > 0)
        {
            sentences.Add(currentSentence);
            currentSentence = new List <AceToken>();
            quoteCount = 0;
        }

        // sentences.Count is the position the current sentence will occupy once it is added.
        // Converting only after the flush above keeps that value in step with the sentence the
        // token actually joins; previously an SGML tag following unterminated text was converted
        // with the preceding sentence's position.
        // NOTE(review): presumably WordTokenToAceToken stores this as the token's sentence
        // index - the helper is not visible here, confirm.
        currentSentence.Add(WordTokenToAceToken(token, sentences.Count));
        if (tokenText == DoubleQuote)
        {
            quoteCount++;
        }

        // start a new sentence whenever we hit sentence-final punctuation
        bool endsSentence = sentenceFinalPuncSet.Contains(tokenText);
        if (endsSentence
            && i < tokenList.Count - 1
            && quoteCount % 2 == 1
            && tokenList[i + 1].GetWord() == DoubleQuote)
        {
            // include quotes after EOS: the quote belongs to this sentence, so consume it here
            currentSentence.Add(WordTokenToAceToken(tokenList[i + 1], sentences.Count));
            i++;
        }

        // Sentence-final punctuation ends the sentence; an SGML tag always stands alone.
        // currentSentence is never empty here because the current token was just added.
        if (endsSentence || isSgml)
        {
            sentences.Add(currentSentence);
            currentSentence = new List <AceToken>();
            quoteCount = 0;
        }
    }

    // Keep tokens that follow the last sentence break; without this they would be dropped.
    if (currentSentence.Count > 0)
    {
        sentences.Add(currentSentence);
    }

    return sentences;
}