/// <summary>
/// Convert the given digit token into (word) Items in the WordRelation.
/// </summary>
/// <remarks>
/// A token whose "nsw" feature equals "nide" is expanded with
/// <c>NumberExpander.expandID</c>. Otherwise the CART classifies the token and the
/// result selects the expansion: "ordinal", "digits", "year" (expanded like an ID),
/// or, for any other value, a plain number.
/// </remarks>
/// <param name="tokenVal">The digit string.</param>
private void digitsToWords(string tokenVal)
{
    FeatureSet featureSet = tokenItem.getFeatures();
    string nsw = featureSet.isPresent("nsw") ? featureSet.getString("nsw") : "";

    if (nsw.Equals("nide"))
    {
        NumberExpander.expandID(tokenVal, wordRelation);
        return;
    }

    string rName = featureSet.getString("name");
    string digitsType;

    if (tokenVal.Equals(rName))
    {
        digitsType = (string)cart.interpret(tokenItem);
    }
    else
    {
        // Temporarily store tokenVal as the token's "name" while the CART runs
        // (presumably the CART reads that feature -- confirm), and always put the
        // original name back, even when interpretation throws.
        featureSet.setString("name", tokenVal);
        try
        {
            digitsType = (string)cart.interpret(tokenItem);
        }
        finally
        {
            featureSet.setString("name", rName);
        }
    }

    if (digitsType.Equals("ordinal"))
    {
        NumberExpander.expandOrdinal(tokenVal, wordRelation);
    }
    else if (digitsType.Equals("digits"))
    {
        NumberExpander.expandDigits(tokenVal, wordRelation);
    }
    else if (digitsType.Equals("year"))
    {
        // Years go through the same expansion as IDs.
        NumberExpander.expandID(tokenVal, wordRelation);
    }
    else
    {
        NumberExpander.expandNumber(tokenVal, wordRelation);
    }
}
/// <summary>
/// Tokenizes the given text and expands every token into words.
/// </summary>
/// <param name="text">The text.</param>
/// <exception cref="IllegalStateException">The utterance has no token relation.</exception>
/// <returns>
/// The string form of every word item, in order, leaving out entries that are
/// null or empty and entries that contain '#'.
/// </returns>
public virtual List<string> expand(string text)
{
    string cleaned = simplifyChars(text);

    CharTokenizer charTokenizer = new CharTokenizer();
    charTokenizer.setWhitespaceSymbols(UsEnglish.WHITESPACE_SYMBOLS);
    charTokenizer.setSingleCharSymbols(UsEnglish.SINGLE_CHAR_SYMBOLS);
    charTokenizer.setPrepunctuationSymbols(UsEnglish.PREPUNCTUATION_SYMBOLS);
    charTokenizer.setPostpunctuationSymbols(UsEnglish.PUNCTUATION_SYMBOLS);
    charTokenizer.setInputText(cleaned);

    Utterance utterance = new Utterance(charTokenizer);

    Relation tokens = utterance.getRelation(Relation.TOKEN);
    if (tokens == null)
    {
        throw new IllegalStateException("token relation does not exist");
    }

    wordRelation = WordRelation.createWordRelation(utterance, this);

    // tokenItem is the shared "current token" that tokenToWords works on,
    // so it is advanced here rather than through a local variable.
    tokenItem = tokens.getHead();
    while (tokenItem != null)
    {
        // convert the token into a list of words
        tokenToWords(tokenItem.getFeatures().getString("name"));
        tokenItem = tokenItem.getNext();
    }

    List<string> result = new List<string>();
    Item wordItem = utterance.getRelation(Relation.WORD).getHead();
    while (wordItem != null)
    {
        string word = wordItem.ToString();
        bool keep = !string.IsNullOrEmpty(word) && !word.Contains("#");
        if (keep)
        {
            result.Add(word);
        }
        wordItem = wordItem.getNext();
    }

    return result;
}
/// <summary>
/// Expands the given number string as an ordinal. The number is first expanded with
/// <c>expandNumber</c>; the name of the relation's tail item is then looked up and,
/// when an ordinal form is found, the last word is replaced by it.
/// </summary>
/// <param name="rawNumberString">The number string; commas are removed before expansion.</param>
/// <param name="wordRelation">The relation that receives the words.</param>
public static void expandOrdinal(string rawNumberString, WordRelation wordRelation)
{
    expandNumber(rawNumberString.Replace(",", ""), wordRelation);

    // get the last in the list of number strings
    Item lastItem = wordRelation.getTail();
    if (lastItem == null)
    {
        return;
    }

    string lastNumber = lastItem.getFeatures().getString("name");

    // Try the unit, teen and tens tables in that order; the first hit wins.
    string ordinal = findMatchInArray(lastNumber, digit2num, ord2num)
                     ?? findMatchInArray(lastNumber, digit2teen, ord2teen)
                     ?? findMatchInArray(lastNumber, digit2enty, ord2enty);

    // Magnitude words take precedence over any table result.
    if (lastNumber.Equals("hundred"))
    {
        ordinal = "hundredth";
    }
    else if (lastNumber.Equals("thousand"))
    {
        ordinal = "thousandth";
    }
    else if (lastNumber.Equals("million"))
    {
        // Previously missing from this chain, so a number ending in "million"
        // kept its cardinal form. Assumes expandNumber emits the word "million"
        // the same way it emits "thousand"/"billion" -- TODO confirm.
        ordinal = "millionth";
    }
    else if (lastNumber.Equals("billion"))
    {
        ordinal = "billionth";
    }

    // if there was an ordinal, set the last element of the list
    // to that ordinal; otherwise, don't do anything
    if (ordinal != null)
    {
        wordRelation.setLastWord(ordinal);
    }
}
/// <summary>
/// Returns true if the given token is the name of a US state. If it is, it
/// will add the name of the state to (word) Items in the WordRelation.
/// </summary>
/// <remarks>
/// The token is looked up in <c>usStatesMap</c>. When the entry's second element is
/// "ambiguous", the state is only expanded if both the previous and the next token
/// look right (see the inline comments). The words added are the non-null entries
/// from index 2 onward.
/// </remarks>
/// <param name="tokenVal">The token string.</param>
/// <returns>True if state words were added to the WordRelation; otherwise false.</returns>
private bool isStateName([In] string tokenVal)
{
    string[] state = usStatesMap.get(tokenVal);
    if (state == null)
    {
        return false;
    }

    bool expandState = true;

    // check to see if the state initials are ambiguous
    // in the English language
    if (state[1].Equals("ambiguous"))
    {
        string previous = (string)tokenItem.findFeature("p.name");
        string next = (string)tokenItem.findFeature("n.name");
        FeatureSet featureSet = tokenItem.getFeatures();

        // check if the previous word is at least 3 letters long,
        // starts with a capital letter, is an alphabet sequence,
        // and has a comma. The length test comes first so that an empty
        // or missing previous name cannot fail on previous[0].
        bool previousIsCity = previous != null
                              && previous.Length > 2
                              && char.IsUpper(previous[0])
                              && matches(alphabetPattern, previous)
                              && tokenItem.findFeature("p.punc").Equals(",");

        // check if the next token starts with a lower case letter, or
        // this is the last token, or this token's punctuation is a
        // period ("."), or the next token is 5 or 10 characters long and
        // matches digitsPattern. An empty or missing next name simply
        // fails the first and last tests instead of throwing.
        bool nextIsGood = (!string.IsNullOrEmpty(next) && char.IsLower(next[0]))
                          || tokenItem.getNext() == null
                          || featureSet.getString("punc").Equals(".")
                          || (next != null
                              && (next.Length == 5 || next.Length == 10)
                              && matches(digitsPattern, next));

        expandState = previousIsCity && nextIsGood;
    }

    if (!expandState)
    {
        return false;
    }

    for (int j = 2; j < state.Length; j++)
    {
        if (state[j] != null)
        {
            wordRelation.addWord(state[j]);
        }
    }

    return true;
}
/// <summary>
/// Converts the given string containing "St" and "Dr" to (word) Items in the WordRelation.
/// </summary>
/// <remarks>
/// A string starting with 's' or 'S' becomes "street" or "saint"; any other string
/// becomes "drive" or "doctor". The choice between the two readings depends on the
/// token's punctuation and on its neighbouring tokens. If the token's punctuation is
/// exactly ".", it is cleared afterwards.
/// </remarks>
/// <param name="drStString">The string with "St" and "Dr".</param>
private void drStToWords(string drStString)
{
    char initial = drStString[0];
    bool startsWithS = initial == 's' || initial == 'S';
    string roadWord = startsWithS ? "street" : "drive";
    string titleWord = startsWithS ? "saint" : "doctor";

    FeatureSet features = tokenItem.getFeatures();
    string ownPunc = features.getString("punc");
    string puncFeature = (string)tokenItem.findFeature("punc");

    string chosen;
    if (tokenItem.getNext() == null || ownPunc.IndexOf(',') != -1)
    {
        // Last token, or followed by a comma: road reading.
        chosen = roadWord;
    }
    else if (puncFeature.Equals(","))
    {
        // NOTE(review): if findFeature("punc") yields the same value as
        // getString("punc"), this branch cannot be reached, because a comma was
        // already handled above. Possibly "p.punc" was intended -- confirm.
        chosen = titleWord;
    }
    else
    {
        string prevName = (string)tokenItem.findFeature("p.name");
        string nextName = (string)tokenItem.findFeature("n.name");
        char prevFirst = prevName[0];
        char nextFirst = nextName[0];

        if ((char.IsUpper(prevFirst) || char.IsDigit(prevFirst)) && char.IsLower(nextFirst))
        {
            // Capitalised word or number before, lower-case word after.
            chosen = roadWord;
        }
        else if (char.IsLower(prevFirst) && char.IsUpper(nextFirst))
        {
            // Lower-case word before, capitalised word after.
            chosen = titleWord;
        }
        else
        {
            // Undecided: a single space before the next token selects the title reading.
            string nextWhitespace = (string)tokenItem.findFeature("n.whitespace");
            chosen = nextWhitespace.Equals(" ") ? titleWord : roadWord;
        }
    }

    wordRelation.addWord(chosen);

    // Drop the abbreviation's period from the token's punctuation.
    if (ownPunc != null && ownPunc.Equals("."))
    {
        features.setString("punc", "");
    }
}
/// <summary>
/// Converts the given Token into (word) Items in the WordRelation.
/// </summary>
/// <remarks>
/// The branches are tried in order and the first one that applies wins, so their
/// order is significant.
/// </remarks>
/// <param name="tokenVal">The string value to expand. It may differ from the "name"
/// feature of the current token, for example when this method calls itself with the
/// part of a token that precedes a trailing '%'.</param>
private void tokenToWords(string tokenVal)
{
    FeatureSet tokenFeatures = tokenItem.getFeatures();
    string itemName = tokenFeatures.getString("name");
    int tokenLength = tokenVal.Length;

    if (tokenFeatures.isPresent("phones"))
    {
        // The token carries a "phones" feature: pass it through unchanged.
        wordRelation.addWord(tokenVal);
    }
    else if ((tokenVal.Equals("a") || tokenVal.Equals("A"))
             && ((tokenItem.getNext() == null)
                 || !(tokenVal.Equals(itemName))
                 || !(((string)tokenItem.findFeature("punc")).Equals(""))))
    {
        /* if A is a sub part of a token, then its ey not ah */
        // Also applies when "a" is the last token or carries punctuation.
        wordRelation.addWord("_a");
    }
    else if (matches(alphabetPattern, tokenVal))
    {
        if (matches(romanNumbersPattern, tokenVal))
        {
            /* XVIII */
            romanToWords(tokenVal);
        }
        else if (matches(illionPattern, tokenVal)
                 && matches(usMoneyPattern, (string)tokenItem.findFeature("p.name")))
        {
            /* $ X -illion */
            wordRelation.addWord(tokenVal);
            wordRelation.addWord("dollars");
        }
        else if (matches(drStPattern, tokenVal))
        {
            /* St Andrew's St, Dr King Dr */
            drStToWords(tokenVal);
        }
        else if (tokenVal.Equals("Mr"))
        {
            tokenItem.getFeatures().setString("punc", "");
            wordRelation.addWord("mister");
        }
        else if (tokenVal.Equals("Mrs"))
        {
            tokenItem.getFeatures().setString("punc", "");
            wordRelation.addWord("missus");
        }
        else if (isCapitalLetterBeforeCapital(tokenVal))
        {
            capitalLetterToWords(tokenVal, tokenFeatures);
        }
        else if (isStateName(tokenVal))
        {
            /*
             * The name of a US state isStateName() has already added the
             * full name of the state, so we're all set.
             */
        }
        else if (tokenLength > 1 && !isPronounceable(tokenVal))
        {
            /* Need common exception list */
            /* unpronouncable list of alphas */
            NumberExpander.expandLetters(tokenVal, wordRelation);
        }
        else
        {
            /* just a word */
            // Invariant lower-casing: the result must not depend on the
            // current culture (e.g. Turkish dotless i for "I").
            wordRelation.addWord(tokenVal.ToLowerInvariant());
        }
    }
    else if (matches(dottedAbbrevPattern, tokenVal))
    {
        /* U.S.A. */
        // remove all dots
        NumberExpander.expandLetters(tokenVal.Replace(".", ""), wordRelation);
    }
    else if (matches(commaIntPattern, tokenVal))
    {
        /* 99,999,999 */
        NumberExpander.expandReal(tokenVal.Replace(",", "").Replace("'", ""), wordRelation);
    }
    else if (matches(sevenPhoneNumberPattern, tokenVal))
    {
        /* 234-3434 telephone numbers */
        int dashIndex = tokenVal.IndexOf('-');
        string aaa = tokenVal.Substring(0, dashIndex);
        string bbb = tokenVal.Substring(dashIndex + 1);

        NumberExpander.expandDigits(aaa, wordRelation);
        wordRelation.addBreak();
        NumberExpander.expandDigits(bbb, wordRelation);
    }
    else if (matchesPartPhoneNumber(tokenVal))
    {
        /* part of a telephone number */
        // A token without punctuation gets a comma as its punctuation.
        string punctuation = (string)tokenItem.findFeature("punc");
        if (punctuation.Equals(""))
        {
            tokenItem.getFeatures().setString("punc", ",");
        }
        NumberExpander.expandDigits(tokenVal, wordRelation);
        wordRelation.addBreak();
    }
    else if (matches(numberTimePattern, tokenVal))
    {
        /* 12:35 */
        int colonIndex = tokenVal.IndexOf(':');
        string aaa = tokenVal.Substring(0, colonIndex);
        string bbb = tokenVal.Substring(colonIndex + 1);

        NumberExpander.expandNumber(aaa, wordRelation);
        // The part after the colon is left out when it is "00".
        if (!(bbb.Equals("00")))
        {
            NumberExpander.expandID(bbb, wordRelation);
        }
    }
    else if (matches(digits2DashPattern, tokenVal))
    {
        /* 999-999-999 */
        digitsDashToWords(tokenVal);
    }
    else if (matches(digitsPattern, tokenVal))
    {
        digitsToWords(tokenVal);
    }
    else if (isCapitalLetterBeforeCapital(tokenVal))
    {
        // Same rule as in the alphabetic branch, for single upper-case
        // characters that did not match alphabetPattern.
        capitalLetterToWords(tokenVal, tokenFeatures);
    }
    else if (matches(doublePattern, tokenVal))
    {
        NumberExpander.expandReal(tokenVal, wordRelation);
    }
    else if (matches(ordinalPattern, tokenVal))
    {
        /* explicit ordinals */
        // Drop the last two characters and expand the rest as an ordinal.
        string aaa = tokenVal.Substring(0, tokenLength - 2);
        NumberExpander.expandOrdinal(aaa, wordRelation);
    }
    else if (matches(usMoneyPattern, tokenVal))
    {
        /* US money */
        usMoneyToWords(tokenVal);
    }
    else if (tokenLength > 0 && tokenVal[tokenLength - 1] == '%')
    {
        /* Y% */
        tokenToWords(tokenVal.Substring(0, tokenLength - 1));
        wordRelation.addWord("percent");
    }
    else if (matches(numessPattern, tokenVal))
    {
        // Expand everything except the final character.
        NumberExpander.expandNumess(tokenVal.Substring(0, tokenLength - 1), wordRelation);
    }
    else if (matches(digitsSlashDigitsPattern, tokenVal) && tokenVal.Equals(itemName))
    {
        digitsSlashDigitsToWords(tokenVal);
    }
    else if (tokenVal.Equals("-"))
    {
        // Skip it. This test has to come before the generic dash branch below;
        // after it, a lone "-" would always be caught by IndexOf('-') and this
        // branch could never run.
    }
    else if (tokenVal.IndexOf('-') != -1)
    {
        dashToWords(tokenVal);
    }
    else if (tokenLength > 1 && !matches(alphabetPattern, tokenVal))
    {
        notJustAlphasToWords(tokenVal);
    }
    else if (tokenVal.Equals("&"))
    {
        // &
        wordRelation.addWord("and");
    }
    else
    {
        // Just a word.
        wordRelation.addWord(tokenVal.ToLowerInvariant());
    }
}

/// <summary>
/// Returns true if the given value is a single upper-case character, the current
/// token's "n.whitespace" feature is exactly one space, and its "n.name" feature
/// starts with an upper-case character.
/// </summary>
/// <param name="tokenVal">The string value being expanded.</param>
private bool isCapitalLetterBeforeCapital(string tokenVal)
{
    return tokenVal.Length == 1
           && char.IsUpper(tokenVal[0])
           && ((string)tokenItem.findFeature("n.whitespace")).Equals(" ")
           && char.IsUpper(((string)tokenItem.findFeature("n.name"))[0]);
}

/// <summary>
/// Clears the token's punctuation and adds the lower-cased letter as a word;
/// the letter "a" is added as "_a".
/// </summary>
/// <param name="tokenVal">The single-character string value being expanded.</param>
/// <param name="tokenFeatures">The feature set of the current token.</param>
private void capitalLetterToWords(string tokenVal, FeatureSet tokenFeatures)
{
    tokenFeatures.setString("punc", "");
    string letter = tokenVal.ToLowerInvariant();
    wordRelation.addWord(letter.Equals("a") ? "_a" : letter);
}