/// <summary>
        /// Builds one <see cref="NumberFormatData"/> by merging the number format data of every
        /// non-neutral culture returned by
        /// <c>CultureInfo.GetCultures(CultureTypes.AllCultures)</c>.
        /// </summary>
        /// <param name="addSeparatorVariants">Passed through to <c>GetNumberFormatData</c> for each culture.</param>
        /// <param name="augmentWhitespaceGroupSeparators">Passed through to <c>GetNumberFormatData</c> for each culture.</param>
        /// <returns>
        /// The merged data. The result is never null; cultures for which
        /// <c>GetNumberFormatData</c> returns null are skipped.
        /// </returns>
        public static NumberFormatData GetCanonicalNumberFormatData(bool addSeparatorVariants,
                                                                    bool augmentWhitespaceGroupSeparators)
        {
            // The accumulator starts out as an empty instance (never null), so the data of
            // every usable culture is always merged into it.
            NumberFormatData result = new NumberFormatData();

            foreach (System.Globalization.CultureInfo ci in System.Globalization.CultureInfo.GetCultures(System.Globalization.CultureTypes.AllCultures))
            {
                // GetNumberFormatData rejects neutral cultures, so leave them out up front.
                if (ci.IsNeutralCulture)
                {
                    continue;
                }

                NumberFormatData ciData = GetNumberFormatData(ci,
                                                              addSeparatorVariants, augmentWhitespaceGroupSeparators);

                // null means no data could be computed for this culture.
                if (ciData != null)
                {
                    result.Merge(ciData);
                }
            }

            return result;
        }
        // only needed for canonical pattern computation (not used)

        /// <summary>
        /// Merges the sign strings, digit sets and separator combinations of
        /// <paramref name="other"/> into this instance. Digit sets and separator
        /// combinations that this instance already contains are not added again.
        /// </summary>
        /// <param name="other">The data to merge into this instance; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        public void Merge(NumberFormatData other)
        {
            if (other == null)
            {
                // Name the offending parameter so the failure is easy to diagnose.
                throw new ArgumentNullException("other");
            }

            PositiveSigns = Core.StringUtilities.MergeStrings(PositiveSigns, other.PositiveSigns);
            NegativeSigns = Core.StringUtilities.MergeStrings(NegativeSigns, other.NegativeSigns);

            // Add only the digit sets that are not present yet.
            foreach (string s in other.Digits)
            {
                if (!Digits.Contains(s))
                {
                    Digits.Add(s);
                }
            }

            // Add only the separator combinations that are not present yet.
            foreach (SeparatorCombination sc in other.SeparatorCombinations)
            {
                if (!SeparatorCombinations.Contains(sc))
                {
                    SeparatorCombinations.Add(sc);
                }
            }
        }
        /// <summary>
        /// Creates the FST for the number pattern computed from the number format data of
        /// <paramref name="culture"/> and makes it deterministic.
        /// </summary>
        /// <param name="culture">The culture whose number format data drives the pattern.</param>
        /// <param name="appendWordTerminator">Forwarded to <c>ComputeFSTPattern</c>.</param>
        /// <returns>The FST after <c>MakeDeterministic</c> has been applied.</returns>
        internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture,
                                                                  bool appendWordTerminator)
        {
            // Both optional behaviours of GetNumberFormatData are switched on here, and the
            // first separators are treated as primary separators in the pattern.
            string pattern = Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(
                NumberPatternComputer.GetNumberFormatData(culture, true, true),
                true,
                appendWordTerminator);

            LanguagePlatform.Lingua.FST.FST numberFst = LanguagePlatform.Lingua.FST.FST.Create(pattern);
            numberFst.MakeDeterministic();

#if DEBUG
            // Flip this flag while debugging to write the FST to disk.
            bool writeDump = false;
            if (writeDump)
            {
                numberFst.Dump(String.Format("d:/temp/number-fst-{0}.txt", culture.Name));
            }
#endif

            return numberFst;
        }
        // Example #4
        // 0
        /// <summary>
        /// Creates a deterministic FST whose pattern is the number pattern of
        /// <paramref name="culture"/>, followed by an optional blank and then one of the
        /// culture's physical units.
        /// </summary>
        /// <param name="culture">The culture providing the number format data and the unit list.</param>
        /// <param name="appendWordTerminator">If true, <c>"#&gt;"</c> is appended after the unit part.</param>
        /// <returns>The FST after <c>MakeDeterministic</c> has been applied.</returns>
        internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture,
                                                                  bool appendWordTerminator)
        {
            NumberFormatData formatData = NumberPatternComputer.GetNumberFormatData(culture, true, true);

            // The number pattern is requested WITHOUT a word terminator; if one is wanted
            // it has to come after the unit part, which is appended below.
            System.Text.StringBuilder pattern = new StringBuilder(
                Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(formatData, true, false));

            // Optional blank between the number and the unit; blanks emit 'U'.
            pattern.Append("(");
            bool firstBlank = true;
            NumberPatternComputer.AppendDisjunction(pattern, Core.CharacterProperties.Blanks, 'U', ref firstBlank);
            pattern.Append(")?(");

            // One alternative per unit, joined with '|'.
            Core.Wordlist units = Core.Tokenization.PhysicalUnit.GetUnits(culture, false);
            bool needsSeparator = false;
            foreach (string unit in units.Items)
            {
                if (needsSeparator)
                {
                    pattern.Append("|");
                }
                needsSeparator = true;

                // The first character of the unit must emit 'U' itself, in case the input
                // has no whitespace separator in front of the unit.
                pattern.AppendFormat("(<{0}:U>", FST.FST.EscapeSpecial(unit[0]));
                if (unit.Length > 1)
                {
                    pattern.Append(FST.FST.EscapeSpecial(unit.Substring(1)));
                }
                pattern.Append(")");
            }

            pattern.Append(")");

            if (appendWordTerminator)
            {
                // Append "word terminator"
                pattern.Append("#>");
            }

            LanguagePlatform.Lingua.FST.FST measureFst = LanguagePlatform.Lingua.FST.FST.Create(pattern.ToString());
            measureFst.MakeDeterministic();

#if DEBUG
            // Flip this flag while debugging to write the FST to disk.
            bool writeDump = false;
            if (writeDump)
            {
                measureFst.Dump(String.Format("d:/temp/measure-fst-{0}.txt", culture.Name));
            }
#endif

            return measureFst;
        }
        /// <summary>
        /// Assembles the FST pattern string for numbers described by <paramref name="data"/>.
        /// The pattern combines a sign part with two alternatives for the numeric part:
        /// a "grouped" form (digit groups of three joined by a group separator) and an
        /// "ungrouped" form (a plain digit run), each with an optional fractional part.
        /// </summary>
        /// <param name="data">Digits, separator combinations and negative pattern to build the pattern from.</param>
        /// <param name="treatFirstSeparatorsAsPrimarySeparators">
        /// If true, the first combined decimal separator (ungrouped form) and the separators of
        /// the first separator combination (grouped form) emit the upper-case output symbols
        /// 'D' / 'G' instead of 'd' / 'g'.
        /// </param>
        /// <param name="appendWordTerminator">If true, <c>"#&gt;"</c> is appended to the pattern.</param>
        /// <returns>The complete pattern string.</returns>
        public static string ComputeFSTPattern(NumberFormatData data,
                                               bool treatFirstSeparatorsAsPrimarySeparators,
                                               bool appendWordTerminator)
        {
            // Known limitations:
            // TODO support non-standard groupings (Hindi)
            // TODO support negative patterns (particularly minus sign following numeric part)
            // Only prefixed signs are recognized, with no optional blank between the sign and
            // the number. Other number patterns (such as "(x)" for a negative number in US
            // accounting) are unsupported.

            // ---------------------------------------- pattern for a single digit

            string digit = ComputeSingleDigit(data.Digits);

            // TODO observe data.NegativeSigns, data.PositiveSigns
            // TODO escape signs just in case they interfere with FST symbols
            string optionalSign = ComputeSign("+\uFF0B", "-\u2013\u2212\uFF0D", true);
            string requiredSign = ComputeSign("+\uFF0B", "-\u2013\u2212\uFF0D", false);

            // ---------------------------------------- ungrouped form (no group separators)

            string        allDecimalSeparators = data.GetCombinedDecimalSeparators();
            StringBuilder plainNumber          = new StringBuilder();

            plainNumber.AppendFormat("({0}+((", digit);

            bool firstAlternative = true;
            for (int i = 0; i < allDecimalSeparators.Length; ++i)
            {
                // Only the very first separator can be the "primary" one.
                char symbol = (i == 0 && treatFirstSeparatorsAsPrimarySeparators) ? 'D' : 'd';
                AppendDisjunction(plainNumber, allDecimalSeparators[i], symbol, ref firstAlternative);
            }

            plainNumber.AppendFormat("){0}+)?)", digit);

            System.Diagnostics.Debug.Assert(IsBalanced(plainNumber.ToString()));

            // ---------------------------------------- grouped form

            StringBuilder groupedNumber = new StringBuilder();

            // Leading group of one to three digits, then the alternatives per combination.
            groupedNumber.AppendFormat("{0}({0}{0}?)?(", digit);

            bool isFirstCombination = true;
            foreach (SeparatorCombination combination in data.SeparatorCombinations)
            {
                System.Diagnostics.Debug.Assert(!String.IsNullOrEmpty(combination.GroupSeparators));
                System.Diagnostics.Debug.Assert(!String.IsNullOrEmpty(combination.DecimalSeparators));

                // The output symbols depend only on whether this is the first combination,
                // so they are fixed once per combination.
                bool usePrimarySymbols = isFirstCombination && treatFirstSeparatorsAsPrimarySeparators;
                char groupSymbol       = usePrimarySymbols ? 'G' : 'g';
                char decimalSymbol     = usePrimarySymbols ? 'D' : 'd';

                System.Text.StringBuilder alternative = new StringBuilder("(");

                // We expect the same group separator throughout the number sequence, not
                // _any_ group separator, so each separator gets its own repeated sub-pattern
                // (can't use something like "[gs][0-9]{3}").
                for (int i = 0; i < combination.GroupSeparators.Length; ++i)
                {
                    if (i > 0)
                    {
                        alternative.Append("|");
                    }

                    alternative.AppendFormat("((<{0}:{1}>{2}{2}{2})+)",
                                             combination.GroupSeparators[i], groupSymbol, digit);
                }
                alternative.Append(")");
                System.Diagnostics.Debug.Assert(IsBalanced(alternative.ToString()));

                // The optional fractional part. NOTE: we assume the group and decimal
                // separators are disjoint.
                alternative.Append("((");
                for (int i = 0; i < combination.DecimalSeparators.Length; ++i)
                {
                    if (i > 0)
                    {
                        alternative.Append("|");
                    }

                    alternative.AppendFormat("<{0}:{1}>",
                                             combination.DecimalSeparators[i], decimalSymbol);
                }
                alternative.AppendFormat("){0}+)?", digit);

                System.Diagnostics.Debug.Assert(IsBalanced(alternative.ToString()));

                // Combinations are joined with '|', each wrapped in its own parentheses.
                if (!isFirstCombination)
                {
                    groupedNumber.Append("|");
                }
                groupedNumber.Append("(").Append(alternative.ToString()).Append(")");

                isFirstCombination = false;
            }

            groupedNumber.Append(")");

            System.Diagnostics.Debug.Assert(IsBalanced(groupedNumber.ToString()));

            // ---------------------------------------- combine sign and numeric part

            string groupedPattern = groupedNumber.ToString();
            string plainPattern   = plainNumber.ToString();

            StringBuilder pattern = new StringBuilder();

            bool signOnlyLeads = !AllowTrailingSign || data.NumberNegativePattern < 3;
            if (signOnlyLeads)
            {
                // sign precedes numeric part
                pattern.AppendFormat("({0}({1}|{2}))", optionalSign,
                                     groupedPattern, plainPattern);
            }
            else
            {
                // Sign follows numeric part, but int.ToString() may generate both forms,
                //  so the recognizer has to handle both.
                // NOTE the leading sign is a required one here, to avoid ambiguous matches
                //  for numbers without signs.
                pattern.AppendFormat("(({0}({1}|{2}))|(({1}|{2}){3}))", requiredSign,
                                     groupedPattern, plainPattern, optionalSign);
            }

            if (appendWordTerminator)
            {
                // Append "word terminator"
                pattern.Append("#>");
            }

            return pattern.ToString();
        }
        /// <summary>
        /// Collects the digit sets, sign strings, group sizes, negative pattern and
        /// group/decimal separator combinations of a specific (non-neutral) culture.
        /// </summary>
        /// <param name="culture">The culture to read the number format from; must not be null or neutral.</param>
        /// <param name="addSeparatorVariants">
        /// If true, extra separator combinations are added for the languages "fr" and "pl", and
        /// for every combination that reports itself as swappable the combination with group
        /// and decimal separators exchanged is added as well.
        /// </param>
        /// <param name="augmentWhitespaceGroupSeparators">If true, blanks/nbsp's will be interchangeable</param>
        /// <returns>
        /// The collected data, or null if the culture's native digits do not concatenate to
        /// exactly 10 characters.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="culture"/> is a neutral culture.</exception>
        public static NumberFormatData GetNumberFormatData(System.Globalization.CultureInfo culture,
                                                           bool addSeparatorVariants, bool augmentWhitespaceGroupSeparators)
        {
            if (culture == null)
            {
                // Name the offending parameter so the failure is easy to diagnose.
                throw new ArgumentNullException("culture");
            }
            if (culture.IsNeutralCulture)
            {
                throw new ArgumentException("Cannot compute number format information for neutral cultures");
            }

            string nativeDigits = String.Join("", culture.NumberFormat.NativeDigits);

            // Only digit sets that concatenate to exactly ten characters are supported.
            if (nativeDigits.Length != 10)
            {
                return(null);
            }

            NumberFormatData result = new NumberFormatData();

            // Native digits first, then the default digits if they differ.
            result.Digits.Add(nativeDigits);
            if (!String.Equals(nativeDigits, DefaultDigits))
            {
                result.Digits.Add(DefaultDigits);
            }

            if (DoAddFullwidthVariantDigits(culture) &&
                !result.Digits.Contains(FullWidthDigits))
            {
                result.Digits.Add(FullWidthDigits);
            }

            result.NegativeSigns         = culture.NumberFormat.NegativeSign;
            result.PositiveSigns         = culture.NumberFormat.PositiveSign;
            result.NumberGroupSizes      = culture.NumberFormat.NumberGroupSizes;
            result.NumberNegativePattern = culture.NumberFormat.NumberNegativePattern;

            // Both the plain-number and the currency separators are registered.
            result.AddSeparatorCombination(culture.NumberFormat.NumberGroupSeparator,
                                           culture.NumberFormat.NumberDecimalSeparator, augmentWhitespaceGroupSeparators);
            result.AddSeparatorCombination(culture.NumberFormat.CurrencyGroupSeparator,
                                           culture.NumberFormat.CurrencyDecimalSeparator, augmentWhitespaceGroupSeparators);

            if (addSeparatorVariants)
            {
                // Language-specific additional combinations.
                switch (culture.TwoLetterISOLanguageName.ToLowerInvariant())
                {
                case "fr":
                    result.AddSeparatorCombination(",", ".", false);
                    break;

                case "pl":
                    result.AddSeparatorCombination(".", ",", false);
                    break;

                default:
                    break;
                }

                // Walk the combinations present at this point from last to first; the index
                // range is fixed before the loop starts.
                // NOTE(review): presumably AddSeparatorCombination appends to the end of the
                // list, so additions made here are never revisited - confirm.
                for (int p = result.SeparatorCombinations.Count - 1; p >= 0; --p)
                {
                    SeparatorCombination sc = result.SeparatorCombinations[p];
                    if (sc.IsSwappable())
                    {
                        // note: swapping separators here
                        result.AddSeparatorCombination(sc.DecimalSeparators, sc.GroupSeparators, false);
                    }
                }
            }

            return(result);
        }