/// <summary>
/// Builds one <c>NumberFormatData</c> instance that accumulates the number format data of
/// every non-neutral culture returned by <c>CultureInfo.GetCultures(CultureTypes.AllCultures)</c>.
/// </summary>
/// <param name="addSeparatorVariants">Passed through to <c>GetNumberFormatData</c> for each culture.</param>
/// <param name="augmentWhitespaceGroupSeparators">Passed through to <c>GetNumberFormatData</c> for each culture.</param>
/// <returns>
/// The accumulated data. Never null: the accumulator is created up front, so the result is an
/// empty <c>NumberFormatData</c> with every usable culture merged into it.
/// </returns>
public static NumberFormatData GetCanonicalNumberFormatData(bool addSeparatorVariants, bool augmentWhitespaceGroupSeparators)
{
    // The accumulator is always a fresh, empty instance. The original code also carried a
    // "result == null" branch that adopted the first culture's data directly, but that branch
    // could never be taken, so every culture has always been merged into this empty instance.
    NumberFormatData result = new NumberFormatData();

    System.Globalization.CultureInfo[] cultures =
        System.Globalization.CultureInfo.GetCultures(System.Globalization.CultureTypes.AllCultures);

    foreach (System.Globalization.CultureInfo ci in cultures)
    {
        // GetNumberFormatData throws for neutral cultures, so they are filtered out here.
        if (ci.IsNeutralCulture)
        {
            continue;
        }

        // A null result means no number format data could be computed for this culture.
        NumberFormatData ciData = GetNumberFormatData(ci, addSeparatorVariants, augmentWhitespaceGroupSeparators);
        if (ciData != null)
        {
            result.Merge(ciData);
        }
    }

    return result;
}
/// <summary>
/// Merges the sign strings, digit sets and separator combinations of
/// <paramref name="other"/> into this instance. No other members are modified.
/// </summary>
/// <remarks>Only needed for canonical pattern computation (not used).</remarks>
/// <param name="other">The data to merge into this instance; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
public void Merge(NumberFormatData other)
{
    if (other == null)
    {
        // Name the offending parameter so the failure is self-explanatory.
        throw new ArgumentNullException("other");
    }

    PositiveSigns = Core.StringUtilities.MergeStrings(PositiveSigns, other.PositiveSigns);
    NegativeSigns = Core.StringUtilities.MergeStrings(NegativeSigns, other.NegativeSigns);

    // Add only those digit sets that are not present yet.
    foreach (string s in other.Digits)
    {
        if (!Digits.Contains(s))
        {
            Digits.Add(s);
        }
    }

    // Same for separator combinations: duplicates are skipped.
    foreach (SeparatorCombination sc in other.SeparatorCombinations)
    {
        if (!SeparatorCombinations.Contains(sc))
        {
            SeparatorCombinations.Add(sc);
        }
    }
}
/// <summary>
/// Creates a deterministic FST from the number pattern computed for <paramref name="culture"/>.
/// </summary>
/// <param name="culture">The culture whose number format data drives the pattern.</param>
/// <param name="appendWordTerminator">Passed through to <c>ComputeFSTPattern</c>.</param>
/// <returns>The FST after <c>MakeDeterministic</c> has been applied.</returns>
internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture, bool appendWordTerminator)
{
    // Separator variants and whitespace group separator augmentation are both switched on.
    NumberFormatData formatData = NumberPatternComputer.GetNumberFormatData(culture, true, true);

    // The first separators are treated as the primary separators.
    LanguagePlatform.Lingua.FST.FST numberFst = LanguagePlatform.Lingua.FST.FST.Create(
        Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(formatData, true, appendWordTerminator));
    numberFst.MakeDeterministic();

#if DEBUG
    // Flip to true locally to write the automaton to disk for inspection.
    bool writeDump = false;
    if (writeDump)
    {
        numberFst.Dump(String.Format("d:/temp/number-fst-{0}.txt", culture.Name));
    }
#endif

    return numberFst;
}
/// <summary>
/// Creates a deterministic FST for measures: the culture's number pattern, followed by an
/// optional blank and one of the physical units known for <paramref name="culture"/>.
/// </summary>
/// <param name="culture">The culture whose number format data and unit list are used.</param>
/// <param name="appendWordTerminator">If true, the word terminator is appended after the unit part.</param>
/// <returns>The FST after <c>MakeDeterministic</c> has been applied.</returns>
internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture, bool appendWordTerminator)
{
    NumberFormatData nfd = NumberPatternComputer.GetNumberFormatData(culture, true, true);

    // No word terminator on the number part: it is appended below, after the unit.
    string numberPattern = Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(nfd, true, false);

    System.Text.StringBuilder sb = new StringBuilder(numberPattern);

    // Optional blank between the number and the unit.
    sb.Append("(");
    bool first = true;
    NumberPatternComputer.AppendDisjunction(sb, Core.CharacterProperties.Blanks, 'U', ref first);
    sb.Append(")?(");

    // Disjunction of all units.
    first = true;
    Core.Wordlist units = Core.Tokenization.PhysicalUnit.GetUnits(culture, false);
    foreach (string unit in units.Items)
    {
        // Guard against null/empty entries: unit[0] below would throw on them. The check comes
        // before the separator handling so that a skipped entry leaves no dangling "|".
        if (String.IsNullOrEmpty(unit))
        {
            continue;
        }

        if (first)
        {
            first = false;
        }
        else
        {
            sb.Append("|");
        }

        // append single unit, make sure that first char emits 'U' (in case no whitespace
        // sep is in the input)
        sb.AppendFormat("(<{0}:U>", FST.FST.EscapeSpecial(unit[0]));

        string remainder = unit.Substring(1);
        if (!String.IsNullOrEmpty(remainder))
        {
            sb.Append(FST.FST.EscapeSpecial(remainder));
        }

        sb.Append(")");
    }
    sb.Append(")");

    if (appendWordTerminator)
    {
        // Append "word terminator"
        sb.Append("#>");
    }

    LanguagePlatform.Lingua.FST.FST fst = LanguagePlatform.Lingua.FST.FST.Create(sb.ToString());
    fst.MakeDeterministic();

#if DEBUG
    // Flip to true locally to write the automaton to disk for inspection.
    bool dump = false;
    if (dump)
    {
        fst.Dump(String.Format("d:/temp/measure-fst-{0}.txt", culture.Name));
    }
#endif

    return fst;
}
/// <summary>
/// Computes the FST pattern string that recognizes numbers described by <paramref name="data"/>.
/// The pattern is a sign part combined with either a grouped number (group separators every
/// three digits, optional fraction) or an ungrouped number (digits with optional fraction).
/// </summary>
/// <param name="data">The digits and separator combinations to build the pattern from; must not be null.</param>
/// <param name="treatFirstSeparatorsAsPrimarySeparators">
/// If true, the first decimal separator and the first separator combination emit the upper-case
/// output symbols 'D'/'G'; all other separators emit 'd'/'g'.
/// </param>
/// <param name="appendWordTerminator">If true, the word terminator is appended to the pattern.</param>
/// <returns>The pattern string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public static string ComputeFSTPattern(NumberFormatData data, bool treatFirstSeparatorsAsPrimarySeparators, bool appendWordTerminator)
{
    // GetNumberFormatData may return null (native digits not exactly 10 characters); fail with
    // a clear argument error instead of a NullReferenceException further down.
    if (data == null)
    {
        throw new ArgumentNullException("data");
    }

    // TODO support non-standard groupings (Hindi)
    // TODO support negative patterns (particularly minus sign following numeric part)
    // we also only recognize prefixed signs, but no optional blank between the sign and the number.
    // Other number patterns (such as "(x)" for a negative number in US accounting) are unsupported.

    // ---------------------------------------- compute RX for a single digit
    string dig = ComputeSingleDigit(data.Digits);

    // TODO observe data.NegativeSigns, data.PositiveSigns
    // TODO escape signs just in case they interfere with FST symbols
    string optSign = ComputeSign("+\uFF0B", "-\u2013\u2212\uFF0D", true);
    string reqSign = ComputeSign("+\uFF0B", "-\u2013\u2212\uFF0D", false);

    // --- the ungrouped case (optional group separators)
    StringBuilder ungrouped = new StringBuilder();
    string decimalSeparators = data.GetCombinedDecimalSeparators();

    ungrouped.AppendFormat("({0}+((", dig);
    bool first = true;
    for (int p = 0; p < decimalSeparators.Length; ++p)
    {
        // Only the very first decimal separator can be the primary one.
        char decimalSymbol = (p == 0 && treatFirstSeparatorsAsPrimarySeparators) ? 'D' : 'd';
        AppendDisjunction(ungrouped, decimalSeparators[p], decimalSymbol, ref first);
    }
    ungrouped.AppendFormat("){0}+)?)", dig);

    System.Diagnostics.Debug.Assert(IsBalanced(ungrouped.ToString()));

    // --- the grouped case
    StringBuilder grouped = new StringBuilder();
    // leading group of one to three digits
    grouped.AppendFormat("{0}({1}{2}?)?(", dig, dig, dig);

    first = true;
    foreach (SeparatorCombination sc in data.SeparatorCombinations)
    {
        System.Diagnostics.Debug.Assert(!String.IsNullOrEmpty(sc.GroupSeparators));
        System.Diagnostics.Debug.Assert(!String.IsNullOrEmpty(sc.DecimalSeparators));

        // Only the first separator combination can be the primary one.
        bool isPrimary = first && treatFirstSeparatorsAsPrimarySeparators;
        char groupSymbol = isPrimary ? 'G' : 'g';
        char fractionSymbol = isPrimary ? 'D' : 'd';

        System.Text.StringBuilder scPattern = new StringBuilder();

        scPattern.Append("(");
        for (int p = 0; p < sc.GroupSeparators.Length; ++p)
        {
            // we expect the same group separators in the number sequence, not _any_
            // group separator. Can't use sth as "[gs][0-9]{3}".
            if (p > 0)
            {
                scPattern.Append("|");
            }
            scPattern.AppendFormat("((<{0}:{1}>{2}{3}{4})+)", sc.GroupSeparators[p], groupSymbol, dig, dig, dig);
        }
        scPattern.Append(")");

        System.Diagnostics.Debug.Assert(IsBalanced(scPattern.ToString()));

        // the optional fractional part. NOTE: we assume the group and decimal seps are disjunct.
        scPattern.Append("((");
        for (int p = 0; p < sc.DecimalSeparators.Length; ++p)
        {
            if (p > 0)
            {
                scPattern.Append("|");
            }
            scPattern.AppendFormat("<{0}:{1}>", sc.DecimalSeparators[p], fractionSymbol);
        }
        scPattern.AppendFormat("){0}+)?", dig);

        System.Diagnostics.Debug.Assert(IsBalanced(scPattern.ToString()));

        if (!first)
        {
            // plain Append: there are no format items in the separator
            grouped.Append("|");
        }
        grouped.AppendFormat("({0})", scPattern.ToString());

        first = false;
    }
    grouped.Append(")");

    System.Diagnostics.Debug.Assert(IsBalanced(grouped.ToString()));

    // ----- combine sign and numeric part
    StringBuilder result = new StringBuilder();

    if (!AllowTrailingSign || data.NumberNegativePattern < 3)
    {
        // sign precedes numeric part
        result.AppendFormat("({0}({1}|{2}))", optSign, grouped.ToString(), ungrouped.ToString());
    }
    else
    {
        // sign follows numeric part, but int.ToString() may generate both forms,
        // so we need to build a recognizer which handles both.
        // NOTE using a required sign for leading sign to avoid ambiguous RX/matches
        // for numbers w/o signs
        result.AppendFormat("(({0}({1}|{2}))|(({1}|{2}){3}))", reqSign, grouped.ToString(), ungrouped.ToString(), optSign);
    }

    if (appendWordTerminator)
    {
        // Append "word terminator"
        result.Append("#>");
    }

    return result.ToString();
}
/// <summary>
/// Collects the number format data (digit sets, signs, group sizes, negative pattern and
/// group/decimal separator combinations) of a specific culture.
/// </summary>
/// <param name="culture">The culture to inspect; must be non-null and not a neutral culture.</param>
/// <param name="addSeparatorVariants">
/// If true, extra separator combinations are added: "," / "." for French, "." / "," for Polish,
/// and the swapped (decimal/group) form of every combination that reports itself as swappable.
/// </param>
/// <param name="augmentWhitespaceGroupSeparators">If true, blanks/nbsp's will be interchangeable</param>
/// <returns>
/// The collected data, or null if the culture's native digits do not join to exactly 10 characters.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="culture"/> is a neutral culture.</exception>
public static NumberFormatData GetNumberFormatData(System.Globalization.CultureInfo culture, bool addSeparatorVariants, bool augmentWhitespaceGroupSeparators)
{
    if (culture == null)
    {
        // Name the offending parameter so the failure is self-explanatory.
        throw new ArgumentNullException("culture");
    }
    if (culture.IsNeutralCulture)
    {
        throw new ArgumentException("Cannot compute number format information for neutral cultures");
    }

    System.Globalization.NumberFormatInfo numberFormat = culture.NumberFormat;

    // Digit sets are stored as one string per set; anything other than 10 characters
    // cannot be used as a digit set, so no data is produced for such cultures.
    string nativeDigits = String.Join("", numberFormat.NativeDigits);
    if (nativeDigits.Length != 10)
    {
        return null;
    }

    NumberFormatData result = new NumberFormatData();

    // Native digits first, then the default digits if they differ, then optionally
    // the full-width variants.
    result.Digits.Add(nativeDigits);
    if (!String.Equals(nativeDigits, DefaultDigits))
    {
        result.Digits.Add(DefaultDigits);
    }
    if (DoAddFullwidthVariantDigits(culture) && !result.Digits.Contains(FullWidthDigits))
    {
        result.Digits.Add(FullWidthDigits);
    }

    result.NegativeSigns = numberFormat.NegativeSign;
    result.PositiveSigns = numberFormat.PositiveSign;
    result.NumberGroupSizes = numberFormat.NumberGroupSizes;
    result.NumberNegativePattern = numberFormat.NumberNegativePattern;

    // The plain number separators go first, followed by the currency separators.
    result.AddSeparatorCombination(numberFormat.NumberGroupSeparator,
        numberFormat.NumberDecimalSeparator, augmentWhitespaceGroupSeparators);
    result.AddSeparatorCombination(numberFormat.CurrencyGroupSeparator,
        numberFormat.CurrencyDecimalSeparator, augmentWhitespaceGroupSeparators);

    if (addSeparatorVariants)
    {
        // Language-specific extra combinations.
        switch (culture.TwoLetterISOLanguageName.ToLowerInvariant())
        {
        case "fr":
            result.AddSeparatorCombination(",", ".", false);
            break;
        case "pl":
            result.AddSeparatorCombination(".", ",", false);
            break;
        default:
            break;
        }

        // Walk backwards over the combinations present at this point, so that combinations
        // added inside the loop are never visited themselves.
        for (int p = result.SeparatorCombinations.Count - 1; p >= 0; --p)
        {
            SeparatorCombination sc = result.SeparatorCombinations[p];
            if (sc.IsSwappable())
            {
                // note: swapping separators here
                result.AddSeparatorCombination(sc.DecimalSeparators, sc.GroupSeparators, false);
            }
        }
    }

    return result;
}