/// <summary>
/// Converts the given digit token into (word) Items in the WordRelation.
/// </summary>
/// <remarks>
/// If the current token item has an "nsw" feature equal to "nide", the digits
/// are expanded with <c>NumberExpander.ExpandId</c>. Otherwise the result of
/// <c>_cart.Interpret</c> on the token item selects the expansion:
/// "ordinal", "digits", "year" (expanded like an id), or a plain number for
/// any other result.
/// </remarks>
/// <param name="tokenVal">The digit string.</param>
private void DigitsToWords(string tokenVal)
{
    FeatureSet featureSet = _tokenItem.Features;

    string nsw = "";
    if (featureSet.IsPresent("nsw"))
    {
        nsw = featureSet.GetString("nsw");
    }

    // Literal-first comparison so a null feature value cannot throw.
    if ("nide".Equals(nsw))
    {
        NumberExpander.ExpandId(tokenVal, _wordRelation);
        return;
    }

    string rName = featureSet.GetString("name");
    string digitsType;

    if (tokenVal.Equals(rName))
    {
        digitsType = (string)_cart.Interpret(_tokenItem);
    }
    else
    {
        // The interpreter reads the item's features, so temporarily expose
        // tokenVal as the item's "name". The finally block guarantees the
        // real name is put back even when Interpret throws.
        featureSet.SetString("name", tokenVal);
        try
        {
            digitsType = (string)_cart.Interpret(_tokenItem);
        }
        finally
        {
            featureSet.SetString("name", rName);
        }
    }

    // A null or unrecognised type falls through to the plain-number expansion.
    switch (digitsType)
    {
        case "ordinal":
            NumberExpander.ExpandOrdinal(tokenVal, _wordRelation);
            break;
        case "digits":
            NumberExpander.ExpandDigits(tokenVal, _wordRelation);
            break;
        case "year":
            NumberExpander.ExpandId(tokenVal, _wordRelation);
            break;
        default:
            NumberExpander.ExpandNumber(tokenVal, _wordRelation);
            break;
    }
}
/// <summary>
/// Tokenizes the given text and expands every token into words.
/// </summary>
/// <remarks>
/// The text is first passed through <c>SimplifyChars</c>, then split by a
/// <c>CharTokenizer</c> configured with the <c>UsEnglish</c> symbol sets.
/// Each token of the resulting utterance is handed to <c>TokenToWords</c>,
/// which appends to the word relation created for this call.
/// This method overwrites the <c>_wordRelation</c> and <c>_tokenItem</c>
/// fields while it runs.
/// </remarks>
/// <param name="text">The text.</param>
/// <exception cref="IllegalStateException">
/// The utterance built from the text has no token relation.
/// </exception>
/// <returns>
/// The string form of every item in the word relation, in order, leaving out
/// items whose string form is null, empty, or contains '#'.
/// </returns>
public virtual List<string> Expand(string text)
{
    string simplifiedText = SimplifyChars(text);

    CharTokenizer tokenizer = new CharTokenizer();
    tokenizer.WhitespaceSymbols = UsEnglish.WhitespaceSymbols;
    tokenizer.SingleCharSymbols = UsEnglish.SingleCharSymbols;
    tokenizer.PrepunctuationSymbols = UsEnglish.PrePunctuationSymbols;
    tokenizer.PostpunctuationSymbols = UsEnglish.PunctuationSymbols;
    tokenizer.SetInputText(simplifiedText);

    Utterance utterance = new Utterance(tokenizer);

    Relation tokenRelation = utterance.GetRelation(Relation.Token);
    if (tokenRelation == null)
    {
        throw new IllegalStateException("token relation does not exist");
    }

    _wordRelation = WordRelation.CreateWordRelation(utterance, this);

    // _tokenItem is the loop variable on purpose: the *ToWords helpers read
    // the current token (and its neighbours) through this field.
    for (_tokenItem = tokenRelation.Head; _tokenItem != null; _tokenItem = _tokenItem.GetNext())
    {
        string tokenVal = _tokenItem.Features.GetString("name");

        // Convert the token into a list of words.
        TokenToWords(tokenVal);
    }

    List<string> words = new List<string>();
    for (Item item = utterance.GetRelation(Relation.Word).Head; item != null; item = item.GetNext())
    {
        // Evaluate ToString() once per item instead of once per use.
        string word = item.ToString();
        if (!string.IsNullOrEmpty(word) && !word.Contains("#"))
        {
            words.Add(word);
        }
    }

    return words;
}
/// <summary>
/// Looks the token up in <c>UsStatesMap</c> and, when it should be read as a
/// US state, appends the state's words to the WordRelation.
/// </summary>
/// <remarks>
/// Element 1 of the map entry marks abbreviations that are ambiguous in
/// English; those are only expanded when the surrounding tokens look like
/// "City, ST" followed by a plausible continuation. Elements 2 and up are the
/// words to add.
/// </remarks>
/// <param name="tokenVal">The token string.</param>
/// <returns>
/// <c>true</c> if the token was expanded as a state name; otherwise <c>false</c>.
/// </returns>
private bool IsStateName([In] string tokenVal)
{
    string[] stateEntry = UsStatesMap.Get(tokenVal);
    if (stateEntry == null)
    {
        return false;
    }

    // Unambiguous entries are always expanded.
    bool shouldExpand = true;

    if (stateEntry[1].Equals("ambiguous"))
    {
        string prevName = (string)_tokenItem.FindFeature("p.name");
        string nextName = (string)_tokenItem.FindFeature("n.name");
        int nextNameLength = nextName.Length;
        FeatureSet tokenFeatures = _tokenItem.Features;

        // The previous token looks like a city: it starts with a capital,
        // is at least three characters long, is purely alphabetic, and is
        // followed by a comma.
        bool previousIsCity =
            char.IsUpper(prevName[0])
            && prevName.Length > 2
            && Matches(AlphabetPattern, prevName)
            && _tokenItem.FindFeature("p.punc").Equals(",");

        // The continuation is acceptable: the next token starts in lower
        // case, or there is no next token, or this token ends with a period,
        // or the next token is a zip code (5 or 10 digits).
        bool nextIsGood =
            char.IsLower(nextName[0])
            || _tokenItem.GetNext() == null
            || tokenFeatures.GetString("punc").Equals(".")
            || ((nextNameLength == 5 || nextNameLength == 10) && Matches(DigitsPattern, nextName));

        shouldExpand = previousIsCity && nextIsGood;
    }

    if (!shouldExpand)
    {
        return false;
    }

    int index = 2;
    while (index < stateEntry.Length)
    {
        string word = stateEntry[index];
        if (word != null)
        {
            _wordRelation.AddWord(word);
        }
        index++;
    }

    return true;
}
/// <summary>
/// Converts a "St" or "Dr" token into (word) Items in the WordRelation,
/// choosing between "street"/"saint" or "drive"/"doctor".
/// </summary>
/// <remarks>
/// The road reading ("street"/"drive") is used when this is the last token or
/// the token's own punctuation contains a comma. Otherwise the capitalisation
/// of the neighbouring tokens and the whitespace before the next token decide.
/// If the token's punctuation is exactly ".", it is cleared afterwards.
/// </remarks>
/// <param name="drStString">The string with "St" and "Dr".</param>
private void DrStToWords(string drStString)
{
    string street;
    string saint;

    char c0 = drStString[0];
    if (c0 == 's' || c0 == 'S')
    {
        street = "street";
        saint = "saint";
    }
    else
    {
        street = "drive";
        saint = "doctor";
    }

    FeatureSet featureSet = _tokenItem.Features;
    string punctuation = featureSet.GetString("punc");
    string featPunctuation = (string)_tokenItem.FindFeature("punc");

    // The "punc" feature may be absent (the final check below already allows
    // for null), so guard the comma test the same way.
    bool hasComma = punctuation != null && punctuation.IndexOf(',') != -1;

    if (_tokenItem.GetNext() == null || hasComma)
    {
        _wordRelation.AddWord(street);
    }
    // NOTE(review): this reads the same "punc" feature as the check above; it
    // may have been meant to look at the previous token ("p.punc") - confirm.
    else if (",".Equals(featPunctuation))
    {
        _wordRelation.AddWord(saint);
    }
    else
    {
        string pName = (string)_tokenItem.FindFeature("p.name");
        string nName = (string)_tokenItem.FindFeature("n.name");
        char p0 = pName[0];
        char n0 = nName[0];

        if ((char.IsUpper(p0) || char.IsDigit(p0)) && char.IsLower(n0))
        {
            // Capitalised word or number before, lower-case word after.
            _wordRelation.AddWord(street);
        }
        else if (char.IsLower(p0) && char.IsUpper(n0))
        {
            // Lower-case word before, capitalised word after.
            _wordRelation.AddWord(saint);
        }
        else
        {
            // Capitalisation is inconclusive; fall back to the whitespace
            // that separates this token from the next one.
            string whitespace = (string)_tokenItem.FindFeature("n.whitespace");
            if (whitespace.Equals(" "))
            {
                _wordRelation.AddWord(saint);
            }
            else
            {
                _wordRelation.AddWord(street);
            }
        }
    }

    // Clear a trailing "." from the token's punctuation.
    if (".".Equals(punctuation))
    {
        featureSet.SetString("punc", "");
    }
}
/// <summary>
/// Converts the given Token into (word) Items in the WordRelation.
/// </summary>
/// <remarks>
/// The branches are tried in order and the first match wins, so their order
/// is significant. Neighbouring tokens are inspected through the current
/// <c>_tokenItem</c>.
/// </remarks>
/// <param name="tokenVal">
/// The string value of the token. It may differ from the token item's "name"
/// feature, for example when this method calls itself on part of a token.
/// </param>
private void TokenToWords(string tokenVal)
{
    FeatureSet tokenFeatures = _tokenItem.Features;
    string itemName = tokenFeatures.GetString("name");
    int tokenLength = tokenVal.Length;

    if (tokenFeatures.IsPresent("phones"))
    {
        _wordRelation.AddWord(tokenVal);
    }
    else if ((tokenVal.Equals("a") || tokenVal.Equals("A"))
             && (_tokenItem.GetNext() == null
                 || !tokenVal.Equals(itemName)
                 || !((string)_tokenItem.FindFeature("punc")).Equals("")))
    {
        /* if A is a sub part of a token, then its ey not ah */
        _wordRelation.AddWord("_a");
    }
    else if (Matches(AlphabetPattern, tokenVal))
    {
        if (Matches(RomanNumbersPattern, tokenVal))
        {
            /* XVIII */
            RomanToWords(tokenVal);
        }
        else if (Matches(IllionPattern, tokenVal)
                 && Matches(UsMoneyPattern, (string)_tokenItem.FindFeature("p.name")))
        {
            /* $ X -illion */
            _wordRelation.AddWord(tokenVal);
            _wordRelation.AddWord("dollars");
        }
        else if (Matches(DrStPattern, tokenVal))
        {
            /* St Andrew's St, Dr King Dr */
            DrStToWords(tokenVal);
        }
        else if (tokenVal.Equals("Mr"))
        {
            _tokenItem.Features.SetString("punc", "");
            _wordRelation.AddWord("mister");
        }
        else if (tokenVal.Equals("Mrs"))
        {
            _tokenItem.Features.SetString("punc", "");
            _wordRelation.AddWord("missus");
        }
        else if (IsSingleCapitalBeforeCapital(tokenVal))
        {
            SingleCapitalToWords(tokenVal);
        }
        else if (IsStateName(tokenVal))
        {
            /*
             * The name of a US state: IsStateName() has already added the
             * full name of the state, so we're all set.
             */
        }
        else if (tokenLength > 1 && !IsPronounceable(tokenVal))
        {
            /* Need common exception list */
            /* unpronounceable list of alphas */
            NumberExpander.ExpandLetters(tokenVal, _wordRelation);
        }
        else
        {
            /* just a word */
            // Invariant lower-casing: the result must not depend on the
            // current thread culture.
            _wordRelation.AddWord(tokenVal.ToLowerInvariant());
        }
    }
    else if (Matches(DottedAbbrevPattern, tokenVal))
    {
        /* U.S.A. */
        // remove all dots
        NumberExpander.ExpandLetters(tokenVal.Replace(".", ""), _wordRelation);
    }
    else if (Matches(CommaIntPattern, tokenVal))
    {
        /* 99,999,999 */
        NumberExpander.ExpandReal(tokenVal.Replace(",", "").Replace("'", ""), _wordRelation);
    }
    else if (Matches(SevenPhoneNumberPattern, tokenVal))
    {
        /* 234-3434 telephone numbers */
        int dashIndex = tokenVal.IndexOf('-');
        string beforeDash = tokenVal.JSubString(0, dashIndex);
        string afterDash = tokenVal.Substring(dashIndex + 1);

        NumberExpander.ExpandDigits(beforeDash, _wordRelation);
        _wordRelation.AddBreak();
        NumberExpander.ExpandDigits(afterDash, _wordRelation);
    }
    else if (MatchesPartPhoneNumber(tokenVal))
    {
        /* part of a telephone number */
        string punctuation = (string)_tokenItem.FindFeature("punc");
        if (punctuation.Equals(""))
        {
            _tokenItem.Features.SetString("punc", ",");
        }
        NumberExpander.ExpandDigits(tokenVal, _wordRelation);
        _wordRelation.AddBreak();
    }
    else if (Matches(NumberTimePattern, tokenVal))
    {
        /* 12:35 */
        int colonIndex = tokenVal.IndexOf(':');
        string beforeColon = tokenVal.JSubString(0, colonIndex);
        string afterColon = tokenVal.Substring(colonIndex + 1);

        NumberExpander.ExpandNumber(beforeColon, _wordRelation);
        // A ":00" part adds no words.
        if (!afterColon.Equals("00"))
        {
            NumberExpander.ExpandId(afterColon, _wordRelation);
        }
    }
    else if (Matches(Digits2DashPattern, tokenVal))
    {
        /* 999-999-999 */
        DigitsDashToWords(tokenVal);
    }
    else if (Matches(DigitsPattern, tokenVal))
    {
        DigitsToWords(tokenVal);
    }
    else if (IsSingleCapitalBeforeCapital(tokenVal))
    {
        // Same rule as inside the alphabetic branch, for single upper-case
        // characters that did not match AlphabetPattern.
        SingleCapitalToWords(tokenVal);
    }
    else if (Matches(DoublePattern, tokenVal))
    {
        NumberExpander.ExpandReal(tokenVal, _wordRelation);
    }
    else if (Matches(OrdinalPattern, tokenVal))
    {
        /* explicit ordinals */
        // Drop the last two characters before expanding.
        string number = tokenVal.JSubString(0, tokenLength - 2);
        NumberExpander.ExpandOrdinal(number, _wordRelation);
    }
    else if (Matches(UsMoneyPattern, tokenVal))
    {
        /* US money */
        UsMoneyToWords(tokenVal);
    }
    else if (tokenLength > 0 && tokenVal[tokenLength - 1] == '%')
    {
        /* Y% */
        // Expand everything before the '%' recursively, then add "percent".
        TokenToWords(tokenVal.JSubString(0, tokenLength - 1));
        _wordRelation.AddWord("percent");
    }
    else if (Matches(NumessPattern, tokenVal))
    {
        // Drop the last character before expanding.
        NumberExpander.ExpandNumess(tokenVal.JSubString(0, tokenLength - 1), _wordRelation);
    }
    else if (Matches(DigitsSlashDigitsPattern, tokenVal) && tokenVal.Equals(itemName))
    {
        DigitsSlashDigitsToWords(tokenVal);
    }
    else if (tokenVal.IndexOf('-') != -1)
    {
        DashToWords(tokenVal);
    }
    else if (tokenLength > 1 && !Matches(AlphabetPattern, tokenVal))
    {
        NotJustAlphasToWords(tokenVal);
    }
    else if (tokenVal.Equals("&"))
    {
        // &
        _wordRelation.AddWord("and");
    }
    else if (tokenVal.Equals("-"))
    {
        // Skip it.
        // NOTE(review): unreachable as written - any token containing '-'
        // is already taken by the DashToWords branch above.
    }
    else
    {
        // Just a word.
        _wordRelation.AddWord(tokenVal.ToLowerInvariant());
    }
}

/// <summary>
/// Tells whether the token is a single upper-case character that is separated
/// by exactly one space from a next token starting with an upper-case
/// character.
/// </summary>
/// <param name="tokenVal">The token string.</param>
/// <returns><c>true</c> if all of the above hold; otherwise <c>false</c>.</returns>
private bool IsSingleCapitalBeforeCapital(string tokenVal)
{
    return tokenVal.Length == 1
           && char.IsUpper(tokenVal[0])
           && ((string)_tokenItem.FindFeature("n.whitespace")).Equals(" ")
           && char.IsUpper(((string)_tokenItem.FindFeature("n.name"))[0]);
}

/// <summary>
/// Clears the current token's punctuation and adds the lower-cased single
/// letter as a word; the letter "a" is added as "_a".
/// </summary>
/// <param name="tokenVal">The single-character token string.</param>
private void SingleCapitalToWords(string tokenVal)
{
    _tokenItem.Features.SetString("punc", "");

    string lowered = tokenVal.ToLowerInvariant();
    if (lowered.Equals("a"))
    {
        _wordRelation.AddWord("_a");
    }
    else
    {
        _wordRelation.AddWord(lowered);
    }
}