/// <summary>
/// Tokenizes the input and returns the result as an array of Word objects.
/// Each Word carries:
/// - the token text
/// - the start offset of the token
/// - the end offset of the token
/// </summary>
public virtual Word[] TokenizeToWords()
{
    // Delegate the actual tokenization, then convert token-by-token.
    IList<RobustTokenizer.WordToken> wordTokens = TokenizeToWordTokens();
    Word[] words = new Word[wordTokens.Count];
    int position = 0;
    foreach (RobustTokenizer.WordToken wordToken in wordTokens)
    {
        words[position++] = new Word(wordToken.GetWord(), wordToken.GetStart(), wordToken.GetEnd());
    }
    return words;
}
/// <summary>
/// Tokenizes a natural language string.
/// Works in two passes: first a straight regex tokenization (splitting "n't"
/// contractions and apostrophe blocks into separate tokens), then a second pass
/// that merges adjacent tokens which together form a known abbreviation.
/// </summary>
/// <returns>List of WordTokens, after Postprocess has been applied</returns>
public virtual IList<RobustTokenizer.WordToken> TokenizeToWordTokens()
{
    IList<RobustTokenizer.WordToken> result = new List<RobustTokenizer.WordToken>();

    //
    // replace illegal characters with SPACE
    //
    /*
     * StringBuffer buffer = new StringBuffer();
     * for(int i = 0; i < originalString.length(); i ++){
     *   int c = (int) originalString.charAt(i);
     *   //
     *   // regular character
     *   //
     *   if(c > 31 && c < 127) buffer.append((char) c);
     *
     *   else{
     *     log.info("Control character at position " + i + ": " + c);
     *
     *     //
     *     // DOS new line counts as two characters
     *     //
     *     if(c == 10) buffer.append(" ");
     *
     *     //
     *     // other control character
     *     //
     *     else buffer.append(' ');
     *   }
     * }
     */

    // NOTE(review): "buffer" is a field declared elsewhere in this class — presumably
    // the text being tokenized; confirm against the rest of the file.
    Matcher match = wordPattern.Matcher(buffer);
    int previousEndMatch = 0;

    //
    // Straight tokenization, ignoring known abbreviations
    //
    while (match.Find())
    {
        string crtMatch = match.Group();
        int endMatch = match.End();
        int startMatch = endMatch - crtMatch.Length;
        int i;
        // found word ending in "n't": split into the stem and the "n't" suffix
        if (crtMatch.EndsWith("n't"))
        {
            if (crtMatch.Length > 3)
            {
                // the stem keeps the newline count since the previous match
                RobustTokenizer.WordToken token1 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, 0, crtMatch.Length - 3), startMatch, endMatch - 3, CountNewLines(buffer, previousEndMatch, startMatch));
                result.Add(token1);
            }
            // the "n't" token itself carries a newline count of 0 (it is adjacent to the stem)
            RobustTokenizer.WordToken token2 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, crtMatch.Length - 3, crtMatch.Length), endMatch - 3, endMatch, 0);
            result.Add(token2);
        }
        else
        {
            // found word containing an apostrophe
            // XXX: is this too relaxed? e.g. "O'Hare"
            if ((i = HasApostropheBlock(crtMatch)) != -1)
            {
                // split at the apostrophe block: [0, i) and [i, end)
                RobustTokenizer.WordToken token1 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, 0, i), startMatch, startMatch + i, CountNewLines(buffer, previousEndMatch, startMatch));
                RobustTokenizer.WordToken token2 = new RobustTokenizer.WordToken(Sharpen.Runtime.Substring(crtMatch, i, crtMatch.Length), startMatch + i, endMatch, 0);
                result.Add(token1);
                result.Add(token2);
            }
            else
            {
                // just a regular word
                RobustTokenizer.WordToken token = new RobustTokenizer.WordToken(crtMatch, startMatch, endMatch, CountNewLines(buffer, previousEndMatch, startMatch));
                result.Add(token);
            }
        }
        previousEndMatch = endMatch;
    }

    //
    // Merge known abbreviations: greedily try the longest candidate span first
    //
    IList<RobustTokenizer.WordToken> resultWithAbs = new List<RobustTokenizer.WordToken>();
    for (int i_1 = 0; i_1 < result.Count; i_1++)
    {
        // where the mw (multiword) ends, capped at MaxMultiWordSize tokens
        int end = result.Count;
        if (end > i_1 + MaxMultiWordSize)
        {
            end = i_1 + MaxMultiWordSize;
        }
        bool found = false;
        // must have at least two tokens per multiword; shrink the span from the right
        for (; end > i_1 + 1; end--)
        {
            RobustTokenizer.WordToken startToken = result[i_1];
            RobustTokenizer.WordToken endToken = result[end - 1];
            if (CountNewLines(result, i_1, end) == 0)
            {
                // abbreviation tokens cannot appear on different lines
                string conc = Concatenate(result, i_1, end);
                found = false;
                // found a multiword
                if ((mAbbreviations.Contains(conc) == true))
                {
                    found = true;
                    // merge the span into a single token, keeping the start token's newline count
                    RobustTokenizer.WordToken token = new RobustTokenizer.WordToken(conc, startToken.GetStart(), endToken.GetEnd(), startToken.GetNewLineCount());
                    resultWithAbs.Add(token);
                    // skip past the merged tokens (outer loop's i_1++ lands on index `end`)
                    i_1 = end - 1;
                    break;
                }
            }
        }
        // no multiword starting at this position found; keep the token as-is
        if (!found)
        {
            resultWithAbs.Add(result[i_1]);
        }
    }

    resultWithAbs = Postprocess(resultWithAbs);
    return (resultWithAbs);
}
/// <summary>
/// Converts a RobustTokenizer.WordToken into an AceToken, recording the token
/// text, its start/end offsets (as decimal strings), and the sentence index.
/// The four intermediate AceToken constructor arguments are left empty.
/// </summary>
/// <param name="wordToken">the token to convert</param>
/// <param name="sentence">index of the sentence this token belongs to</param>
public static AceToken WordTokenToAceToken(RobustTokenizer.WordToken wordToken, int sentence)
{
    // Fix: `int.ToString(x)` is not valid C# — Int32 has no static ToString(int)
    // overload (Sharpen artifact of Java's Integer.toString). Use the instance
    // ToString() on the offset values instead.
    return new AceToken(wordToken.GetWord(), string.Empty, string.Empty, string.Empty, string.Empty, wordToken.GetStart().ToString(), wordToken.GetEnd().ToString(), sentence);
}
/// <summary>
/// Reads an ACE .sgm file, tokenizes it, and groups the tokens into sentences.
/// A new sentence is started before and after every SGML token (so SGML tags end
/// up isolated), and after sentence-final punctuation, optionally pulling in a
/// closing quote that immediately follows the punctuation.
/// </summary>
/// <param name="filenamePrefix">path to an ACE .sgm file (but not including the .sgm extension)</param>
/// <returns>the list of sentences, each a list of AceTokens</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
public static IList<IList<AceToken>> TokenizeAndSegmentSentences(string filenamePrefix)
{
    IList<IList<AceToken>> sentences = new List<IList<AceToken>>();
    File inputFile = new File(filenamePrefix + AceDocument.OrigExt);
    string input = IOUtils.SlurpFile(inputFile);

    // now we can split the text into tokens
    RobustTokenizer<Word> tokenizer = new RobustTokenizer<Word>(input);
    IList<RobustTokenizer.WordToken> tokenList = tokenizer.TokenizeToWordTokens();

    // and group the tokens into sentences
    List<AceToken> currentSentence = new List<AceToken>();
    // parity counter for double quotes seen in the current sentence (odd = inside a quotation)
    int quoteCount = 0;
    for (int i = 0; i < tokenList.Count; i++)
    {
        RobustTokenizer.WordToken token = tokenList[i];
        string tokenText = token.GetWord();
        AceToken convertedToken = WordTokenToAceToken(token, sentences.Count);
        // start a new sentence if we skipped 2+ lines (after datelines, etc.)
        // or we hit some SGML
        // if (token.getNewLineCount() > 1 || AceToken.isSgml(tokenText)) {
        if (AceToken.IsSgml(tokenText))
        {
            // flush the sentence in progress BEFORE the SGML token
            if (currentSentence.Count > 0)
            {
                sentences.Add(currentSentence);
            }
            currentSentence = new List<AceToken>();
            quoteCount = 0;
        }
        currentSentence.Add(convertedToken);
        if (tokenText.Equals("\""))
        {
            quoteCount++;
        }
        // start a new sentence whenever we hit sentence-final punctuation
        if (sentenceFinalPuncSet.Contains(tokenText))
        {
            // include quotes after EOS: if a quotation is open (odd quote count) and
            // the next token is a closing quote, keep it with this sentence
            if (i < tokenList.Count - 1 && quoteCount % 2 == 1 && tokenList[i + 1].GetWord().Equals("\""))
            {
                AceToken quoteToken = WordTokenToAceToken(tokenList[i + 1], sentences.Count);
                currentSentence.Add(quoteToken);
                quoteCount++;
                i++; // consume the quote token so the outer loop skips it
            }
            if (currentSentence.Count > 0)
            {
                sentences.Add(currentSentence);
            }
            currentSentence = new List<AceToken>();
            quoteCount = 0;
        }
        else
        {
            // start a new sentence when we hit an SGML tag — this second check runs
            // AFTER the SGML token was added, so the tag sits alone in its sentence
            if (AceToken.IsSgml(tokenText))
            {
                if (currentSentence.Count > 0)
                {
                    sentences.Add(currentSentence);
                }
                currentSentence = new List<AceToken>();
                quoteCount = 0;
            }
        }
    }
    // NOTE(review): any tokens still in currentSentence after the loop are dropped.
    // For ACE .sgm input the document presumably ends with an SGML tag (which sits
    // alone in currentSentence here), so only that tag is lost — but if input can
    // end with ordinary text, a trailing flush would be needed. Confirm intent.
    return (sentences);
}