/// <summary>
/// Builds the measure recognizer registered under the name "DEFAULT_MEASURE_RECOGNIZER"
/// for the given culture. Number patterns are computed for the culture's default
/// separators, then (if the separators are swappable) for the swapped separators, and
/// finally (if <c>NumberPatternComputer.DoAddENUSSeparators</c> says so) for the en-US
/// separators. Each pattern list is passed through <c>AugmentPatterns</c> before being added.
/// </summary>
/// <param name="culture">Culture the number patterns are computed for.</param>
/// <param name="priority">
/// Not read by this method; the recognizer is constructed with the literal 100.
/// NOTE(review): confirm whether ignoring this argument is intended.
/// </param>
/// <returns>
/// The configured recognizer, or <c>null</c> if any exception is thrown while building it.
/// </returns>
public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
{
    try
    {
        // TODO support non-blank languages for unit separation
        MeasureRegexRecognizer recognizer = new MeasureRegexRecognizer(100, "DEFAULT_MEASURE_RECOGNIZER", culture);

        AddAugmentedPatternsForMode(recognizer, culture, NumberSeparatorMode.CultureDefault, 2);

        SeparatorCombination cultureSeparators = new SeparatorCombination(culture, false);
        if (cultureSeparators.IsSwappable())
        {
            AddAugmentedPatternsForMode(recognizer, culture, NumberSeparatorMode.Swapped, 1);
        }

        if (NumberPatternComputer.DoAddENUSSeparators(culture))
        {
            AddAugmentedPatternsForMode(recognizer, culture, NumberSeparatorMode.EnUS, 0);
        }

        // otherwise "123 ABC" will be recognized as "123 A" "BC" in Japanese
        recognizer.OnlyIfFollowedByNonwordCharacter = true;

        return recognizer;
    }
    catch
    {
        // Any failure above (including a null pattern list being enumerated) is
        // reported to the caller as "no recognizer".
        return null;
    }
}

/// <summary>
/// Computes the number regex patterns for one separator mode, augments them, and adds
/// every pattern to <paramref name="recognizer"/>.
/// </summary>
/// <param name="recognizer">Recognizer that receives the patterns.</param>
/// <param name="culture">Culture the patterns are computed for.</param>
/// <param name="mode">Separator mode forwarded to the pattern computer.</param>
/// <param name="level">Integer forwarded unchanged as the third argument of <c>Add</c>.</param>
private static void AddAugmentedPatternsForMode(MeasureRegexRecognizer recognizer, System.Globalization.CultureInfo culture, NumberSeparatorMode mode, int level)
{
    Core.CharacterSet firstSet = null;

    // augmentation doesn't change FIRST()
    List<string> modePatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, mode, out firstSet);
    AugmentPatterns(modePatterns, culture);

    // use the same first for all patterns (the number regex pattern computer
    // returns just one pattern anyway)
    foreach (string pattern in modePatterns)
    {
        recognizer.Add(pattern, firstSet, level);
    }
}
/// <summary>
/// Builds the number recognizer registered under the name "DEFAULT_NUMBER_RECOGNIZER"
/// for the given culture. Patterns for the culture's default separators are added with
/// the value 2; patterns for swapped separators (when swappable) and for en-US
/// separators (when <c>NumberPatternComputer.DoAddENUSSeparators</c> says so) are both
/// added with the value 1.
/// </summary>
/// <param name="culture">Culture whose number format drives the patterns.</param>
/// <param name="priority">
/// Not read by this method; the recognizer is constructed with the literal 100.
/// NOTE(review): confirm whether ignoring this argument is intended.
/// </param>
/// <returns>
/// The configured recognizer, or <c>null</c> if any exception is thrown while building it.
/// </returns>
public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
{
    try
    {
        NumberRegexRecognizer recognizer = new NumberRegexRecognizer(100, "DEFAULT_NUMBER_RECOGNIZER", culture.NumberFormat);

        AddPatternsForSeparatorMode(recognizer, culture, NumberSeparatorMode.CultureDefault, 2);

        SeparatorCombination cultureSeparators = new SeparatorCombination(culture, false);
        if (cultureSeparators.IsSwappable())
        {
            AddPatternsForSeparatorMode(recognizer, culture, NumberSeparatorMode.Swapped, 1);
        }

        if (NumberPatternComputer.DoAddENUSSeparators(culture))
        {
            AddPatternsForSeparatorMode(recognizer, culture, NumberSeparatorMode.EnUS, 1);
        }

        recognizer.OnlyIfFollowedByNonwordCharacter = CultureInfoExtensions.UseBlankAsWordSeparator(culture);

        recognizer.AdditionalTerminators = new CharacterSet();
        recognizer.AdditionalTerminators.Add('-'); // TODO other math symbols?

        recognizer.OverrideFallbackRecognizer = true;

        return recognizer;
    }
    catch
    {
        // Any failure above (including a null pattern list being enumerated) is
        // reported to the caller as "no recognizer".
        return null;
    }
}

/// <summary>
/// Computes the number regex patterns for one separator mode and adds every pattern
/// to <paramref name="recognizer"/>.
/// </summary>
/// <param name="recognizer">Recognizer that receives the patterns.</param>
/// <param name="culture">Culture the patterns are computed for.</param>
/// <param name="mode">Separator mode forwarded to the pattern computer.</param>
/// <param name="level">Integer forwarded unchanged as the third argument of <c>Add</c>.</param>
private static void AddPatternsForSeparatorMode(NumberRegexRecognizer recognizer, System.Globalization.CultureInfo culture, NumberSeparatorMode mode, int level)
{
    CharacterSet firstSet = null;
    List<string> modePatterns = ComputeRXPatterns(culture, mode, out firstSet);

    foreach (string pattern in modePatterns)
    {
        recognizer.Add(pattern, firstSet, level);
    }
}
/// <summary>
/// Creates a deterministic FST from the number pattern that
/// <c>NumberPatternComputer.ComputeFSTPattern</c> produces for the culture's number
/// format data.
/// </summary>
/// <param name="culture">Culture whose number format data is looked up.</param>
/// <param name="appendWordTerminator">
/// Forwarded as the last argument of <c>ComputeFSTPattern</c>.
/// </param>
/// <returns>The FST after <c>MakeDeterministic</c> has been called on it.</returns>
internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture, bool appendWordTerminator)
{
    NumberFormatData formatData = NumberPatternComputer.GetNumberFormatData(culture, true, true);

    LanguagePlatform.Lingua.FST.FST numberFst = LanguagePlatform.Lingua.FST.FST.Create(
        Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(formatData, true, appendWordTerminator));
    numberFst.MakeDeterministic();

#if DEBUG
    // Debug-only aid: flip the flag by hand to write the FST to a text file.
    bool dump = false;
    if (dump)
    {
        numberFst.Dump(String.Format("d:/temp/number-fst-{0}.txt", culture.Name));
    }
#endif

    return numberFst;
}
/// <summary>
/// Creates a deterministic FST for measures: the culture's number pattern, followed by
/// an optional group built from <c>Core.CharacterProperties.Blanks</c>, followed by a
/// disjunction of the physical units returned by <c>PhysicalUnit.GetUnits</c>.
/// </summary>
/// <param name="culture">Culture used for the number format data and the unit list.</param>
/// <param name="appendWordTerminator">
/// When true, the literal <c>#&gt;</c> ("word terminator") is appended after the unit group.
/// </param>
/// <returns>The FST after <c>MakeDeterministic</c> has been called on it.</returns>
/// <remarks>
/// Null or empty entries in the unit list are skipped; previously an empty entry made
/// <c>unit[0]</c> throw.
/// </remarks>
internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture, bool appendWordTerminator)
{
    NumberFormatData nfd = NumberPatternComputer.GetNumberFormatData(culture, true, true);

    // The number part is requested without a terminator; the terminator (if any)
    // is appended below, after the unit group.
    string numberPattern = Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(nfd, true, false);

    System.Text.StringBuilder sb = new System.Text.StringBuilder(numberPattern);

    // Optional blank between number and unit.
    sb.Append("(");
    bool firstBlank = true;
    NumberPatternComputer.AppendDisjunction(sb, Core.CharacterProperties.Blanks, 'U', ref firstBlank);
    sb.Append(")?(");

    // One alternative per unit.
    bool firstUnit = true;
    Core.Wordlist units = Core.Tokenization.PhysicalUnit.GetUnits(culture, false);
    foreach (string unit in units.Items)
    {
        if (String.IsNullOrEmpty(unit))
        {
            // Nothing to match, and unit[0] below would throw.
            continue;
        }

        if (firstUnit)
        {
            firstUnit = false;
        }
        else
        {
            sb.Append("|");
        }

        // append single unit, make sure that first char emits 'U' (in case no whitespace
        // sep is in the input)
        sb.AppendFormat("(<{0}:U>", FST.FST.EscapeSpecial(unit[0]));
        if (unit.Length > 1)
        {
            sb.Append(FST.FST.EscapeSpecial(unit.Substring(1)));
        }
        sb.Append(")");
    }
    sb.Append(")");

    if (appendWordTerminator)
    {
        // Append "word terminator"
        sb.Append("#>");
    }

    LanguagePlatform.Lingua.FST.FST fst = LanguagePlatform.Lingua.FST.FST.Create(sb.ToString());
    fst.MakeDeterministic();

#if DEBUG
    // Debug-only aid: flip the flag by hand to write the FST to a text file.
    bool dump = false;
    if (dump)
    {
        fst.Dump(String.Format("d:/temp/measure-fst-{0}.txt", culture.Name));
    }
#endif

    return fst;
}
/// <summary>
/// Computes the number regex patterns for a culture under the requested separator mode.
/// Patterns are first built with the culture's native digits; patterns for full-width
/// digits and for the digits 0-9 are appended when the respective culture checks say so.
/// </summary>
/// <param name="culture">A non-neutral culture supplying digits and separators.</param>
/// <param name="separatorMode">
/// Which decimal/group separators to use: the culture's own, the culture's own swapped,
/// or "." / ",".
/// </param>
/// <param name="first">
/// Receives the character set produced alongside the patterns, extended with the sets of
/// any alternate digit patterns that were appended. Set to null when the method returns
/// null before computing patterns.
/// </param>
/// <returns>
/// The list of patterns, or null for a neutral culture or when the underlying pattern
/// computation returns null.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="separatorMode"/> is not one of the handled enum values.
/// </exception>
internal static List<string> ComputeRXPatterns(System.Globalization.CultureInfo culture, NumberSeparatorMode separatorMode, out CharacterSet first)
{
    first = null;

    if (culture == null)
    {
        throw new ArgumentNullException("culture");
    }

    if (culture.IsNeutralCulture)
    {
        System.Diagnostics.Debug.Assert(false, "Cannot compute number pattern for neutral cultures");
        return null;
    }

    IList<string> digits = culture.NumberFormat.NativeDigits;
    string decSep = culture.NumberFormat.NumberDecimalSeparator;
    string grpSep = culture.NumberFormat.NumberGroupSeparator;

    switch (separatorMode)
    {
    case NumberSeparatorMode.CultureDefault:
        // nop
        break;
    case NumberSeparatorMode.Swapped:
        {
            string tmp = decSep;
            decSep = grpSep;
            grpSep = tmp;
        }
        break;
    case NumberSeparatorMode.EnUS:
        decSep = ".";
        grpSep = ",";
        break;
    default:
        // Derived from Exception, so existing catch sites keep working.
        throw new ArgumentOutOfRangeException("separatorMode", separatorMode, "Unexpected");
    }

    List<string> result = ComputeRXPatterns(grpSep, decSep, digits, out first);
    if (result == null)
    {
        return null;
    }

    // add full-width variant digits for jp, zh, ko
    if (NumberPatternComputer.DoAddFullwidthVariantDigits(culture))
    {
        digits = "\uFF10|\uFF11|\uFF12|\uFF13|\uFF14|\uFF15|\uFF16|\uFF17|\uFF18|\uFF19".Split('|');
        AppendAlternateDigitPatterns(grpSep, decSep, digits, result, first);
    }

    // add patterns with default digits
    if (!CultureInfoExtensions.UsesDefaultDigits(culture))
    {
        digits = "0|1|2|3|4|5|6|7|8|9".Split('|');
        AppendAlternateDigitPatterns(grpSep, decSep, digits, result, first);
    }

    return result;
}

/// <summary>
/// Computes patterns for an alternate digit set and, if any were produced, appends them
/// to <paramref name="result"/> and merges their character set into <paramref name="first"/>.
/// </summary>
private static void AppendAlternateDigitPatterns(string grpSep, string decSep, IList<string> digits, List<string> result, CharacterSet first)
{
    Core.CharacterSet alternateFirst;
    List<string> alternate = ComputeRXPatterns(grpSep, decSep, digits, out alternateFirst);
    if (alternate != null)
    {
        result.AddRange(alternate);
        first.Add(alternateFirst);
    }
}