/// <summary>
/// Command-line entry point: reads the file named by the single argument,
/// tokenizes its contents with <c>RobustTokenizer</c>, and writes each token
/// on its own line to standard output.
/// </summary>
/// <param name="argv">must contain exactly one element: the path of the file to tokenize</param>
/// <exception cref="System.Exception"/>
public static void Main(string[] argv)
{
    if (argv.Length != 1)
    {
        log.Info("Usage: java edu.stanford.nlp.ie.machinereading.common.RobustTokenizer <file to tokenize>");
        System.Environment.Exit(1);
    }

    // read the whole file into a buffer, one character at a time
    // XXX: for sure there are more efficient ways of reading a file...
    StringBuilder buffer = new StringBuilder();
    BufferedReader @is = new BufferedReader(new FileReader(argv[0]));
    try
    {
        int ch;
        while ((ch = @is.Read()) != -1)
        {
            buffer.Append((char)ch);
        }
    }
    finally
    {
        // release the file handle even when reading fails part-way through
        @is.Close();
    }

    // create the tokenizer object and print every token it produces
    RobustTokenizer<Word> t = new RobustTokenizer<Word>(buffer.ToString());
    IList<Word> tokens = t.Tokenize();
    foreach (Word token in tokens)
    {
        System.Console.Out.WriteLine(token);
    }
}
/// <summary>
/// Reads the file <c>filenamePrefix + AceDocument.OrigExt</c>, tokenizes its text and
/// groups the tokens into sentences.
/// </summary>
/// <remarks>
/// A sentence ends after any token contained in <c>sentenceFinalPuncSet</c> (plus an
/// immediately following closing double quote, when a quote is still open). Every
/// token for which <c>AceToken.IsSgml</c> is true is placed in a sentence of its own.
/// Tokens left over after the last boundary are returned as a final sentence.
/// </remarks>
/// <param name="filenamePrefix">path to an ACE .sgm file (but not including the .sgm extension)</param>
/// <returns>the sentences in document order, each one a list of its tokens</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
public static IList<IList<AceToken>> TokenizeAndSegmentSentences(string filenamePrefix)
{
    IList<IList<AceToken>> sentences = new List<IList<AceToken>>();
    File inputFile = new File(filenamePrefix + AceDocument.OrigExt);
    string input = IOUtils.SlurpFile(inputFile);

    // now we can split the text into tokens
    RobustTokenizer<Word> tokenizer = new RobustTokenizer<Word>(input);
    IList<RobustTokenizer.WordToken> tokenList = tokenizer.TokenizeToWordTokens();

    // and group the tokens into sentences
    List<AceToken> currentSentence = new List<AceToken>();
    // number of double-quote tokens seen so far in the current sentence
    int quoteCount = 0;
    for (int i = 0; i < tokenList.Count; i++)
    {
        RobustTokenizer.WordToken token = tokenList[i];
        string tokenText = token.GetWord();
        // sentences.Count is the number of sentences completed so far
        // (presumably used as this token's sentence index -- confirm in WordTokenToAceToken)
        AceToken convertedToken = WordTokenToAceToken(token, sentences.Count);
        bool isSgml = AceToken.IsSgml(tokenText);

        // close the running sentence before an SGML tag
        // (the former "skipped 2+ lines" newline-count trigger is disabled)
        if (isSgml)
        {
            if (currentSentence.Count > 0)
            {
                sentences.Add(currentSentence);
            }
            currentSentence = new List<AceToken>();
            quoteCount = 0;
        }

        currentSentence.Add(convertedToken);
        if (tokenText.Equals("\""))
        {
            quoteCount++;
        }

        if (sentenceFinalPuncSet.Contains(tokenText))
        {
            // start a new sentence whenever we hit sentence-final punctuation;
            // an odd quote count means a quote is still open, so a directly
            // following '"' closes it and belongs to this sentence
            if (i < tokenList.Count - 1 && quoteCount % 2 == 1 && tokenList[i + 1].GetWord().Equals("\""))
            {
                AceToken quoteToken = WordTokenToAceToken(tokenList[i + 1], sentences.Count);
                currentSentence.Add(quoteToken);
                i++; // the quote has been consumed; skip it in the main loop
            }
            if (currentSentence.Count > 0)
            {
                sentences.Add(currentSentence);
            }
            currentSentence = new List<AceToken>();
            quoteCount = 0;
        }
        else if (isSgml)
        {
            // an SGML tag also ends the sentence it was just added to,
            // so the tag ends up in a sentence of its own
            if (currentSentence.Count > 0)
            {
                sentences.Add(currentSentence);
            }
            currentSentence = new List<AceToken>();
            quoteCount = 0;
        }
    }

    // keep tokens that follow the last sentence boundary instead of dropping them
    if (currentSentence.Count > 0)
    {
        sentences.Add(currentSentence);
    }
    return sentences;
}