/// <summary>
        /// Builds the default measure recognizer for <paramref name="culture"/>.
        /// Number patterns are computed once per applicable separator mode, passed through
        /// <c>AugmentPatterns</c>, and registered on the recognizer with a per-mode value:
        /// 2 for the culture's own separators, 1 for swapped separators, 0 for en-US separators.
        /// </summary>
        /// <param name="culture">Culture the patterns are computed for.</param>
        /// <param name="priority">
        /// NOTE(review): not read by this method; the recognizer is always constructed with the fixed value 100.
        /// </param>
        /// <returns>The configured recognizer, or <c>null</c> if anything throws while it is being built.</returns>
        public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
        {
            try
            {
                // TODO support non-blank languages for unit separation
                MeasureRegexRecognizer recognizer = new MeasureRegexRecognizer(100, "DEFAULT_MEASURE_RECOGNIZER", culture);

                // augmentation doesn't change FIRST(), so the set reported for the raw number
                // patterns is reused for the augmented ones
                Core.CharacterSet firstSet = null;

                // 1) separators exactly as the culture defines them
                List<string> defaultPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out firstSet);
                AugmentPatterns(defaultPatterns, culture);
                foreach (string pattern in defaultPatterns)
                {
                    // one shared first set for all patterns (the number regex pattern computer returns just one pattern anyway)
                    recognizer.Add(pattern, firstSet, 2);
                }

                // 2) decimal and group separators exchanged, if the culture's combination allows it
                if (new SeparatorCombination(culture, false).IsSwappable())
                {
                    List<string> swappedPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.Swapped, out firstSet);
                    AugmentPatterns(swappedPatterns, culture);
                    foreach (string pattern in swappedPatterns)
                    {
                        recognizer.Add(pattern, firstSet, 1);
                    }
                }

                // 3) en-US separators, if requested for this culture
                if (NumberPatternComputer.DoAddENUSSeparators(culture))
                {
                    List<string> enUsPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.EnUS, out firstSet);
                    AugmentPatterns(enUsPatterns, culture);
                    foreach (string pattern in enUsPatterns)
                    {
                        recognizer.Add(pattern, firstSet, 0);
                    }
                }

                // otherwise "123 ABC" will be recognized as "123 A" "BC" in Japanese
                recognizer.OnlyIfFollowedByNonwordCharacter = true;

                return recognizer;
            }
            catch
            {
                // deliberate best effort: any failure (including a null pattern list) yields "no recognizer"
                return null;
            }
        }
예제 #2
0
        /// <summary>
        /// Builds the default number recognizer for <paramref name="culture"/>.
        /// Patterns are registered with the value 2 for the culture's own separators and 1 for both
        /// the swapped-separator and the en-US variants (each only when applicable).
        /// </summary>
        /// <param name="culture">Culture whose number format drives the patterns.</param>
        /// <param name="priority">
        /// NOTE(review): not read by this method; the recognizer is always constructed with the fixed value 100.
        /// </param>
        /// <returns>The configured recognizer, or <c>null</c> if anything throws while it is being built.</returns>
        public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
        {
            try
            {
                NumberRegexRecognizer recognizer = new NumberRegexRecognizer(100, "DEFAULT_NUMBER_RECOGNIZER", culture.NumberFormat);

                CharacterSet firstSet = null;

                // separators exactly as the culture defines them
                List<string> defaultPatterns = ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out firstSet);
                foreach (string pattern in defaultPatterns)
                {
                    recognizer.Add(pattern, firstSet, 2);
                }

                // decimal and group separators exchanged, if the culture's combination allows it
                if (new SeparatorCombination(culture, false).IsSwappable())
                {
                    List<string> swappedPatterns = ComputeRXPatterns(culture, NumberSeparatorMode.Swapped, out firstSet);
                    foreach (string pattern in swappedPatterns)
                    {
                        recognizer.Add(pattern, firstSet, 1);
                    }
                }

                // en-US separators, if requested for this culture
                if (NumberPatternComputer.DoAddENUSSeparators(culture))
                {
                    List<string> enUsPatterns = ComputeRXPatterns(culture, NumberSeparatorMode.EnUS, out firstSet);
                    foreach (string pattern in enUsPatterns)
                    {
                        recognizer.Add(pattern, firstSet, 1);
                    }
                }

                recognizer.OnlyIfFollowedByNonwordCharacter = CultureInfoExtensions.UseBlankAsWordSeparator(culture);

                // a '-' is additionally accepted as terminator
                recognizer.AdditionalTerminators = new CharacterSet();
                recognizer.AdditionalTerminators.Add('-'); // TODO other math symbols?

                recognizer.OverrideFallbackRecognizer = true;

                return recognizer;
            }
            catch
            {
                // deliberate best effort: any failure (including a null pattern list) yields "no recognizer"
                return null;
            }
        }
        /// <summary>
        /// Creates the number FST for <paramref name="culture"/>: obtains the culture's number format data,
        /// turns it into an FST pattern, builds the FST from that pattern and makes it deterministic.
        /// </summary>
        /// <param name="culture">Culture the number format data is taken from.</param>
        /// <param name="appendWordTerminator">Forwarded unchanged to <c>ComputeFSTPattern</c>.</param>
        /// <returns>The deterministic FST.</returns>
        internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture,
                                                                  bool appendWordTerminator)
        {
            NumberFormatData formatData = NumberPatternComputer.GetNumberFormatData(culture, true, true);
            string pattern = Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(formatData, true, appendWordTerminator);

            LanguagePlatform.Lingua.FST.FST automaton = LanguagePlatform.Lingua.FST.FST.Create(pattern);
            automaton.MakeDeterministic();

#if DEBUG
            // debugging aid: flip the flag (e.g. in the debugger) to write the automaton to disk
            bool writeDump = false;
            if (writeDump)
            {
                automaton.Dump(String.Format("d:/temp/number-fst-{0}.txt", culture.Name));
            }
#endif

            return automaton;
        }
예제 #4
0
        /// <summary>
        /// Creates the measure FST for <paramref name="culture"/>. The pattern is the culture's number
        /// pattern, followed by an optional group filled by <c>AppendDisjunction</c> for the blank
        /// characters, followed by a group with one alternative per physical unit; optionally a word
        /// terminator is appended. The resulting FST is made deterministic.
        /// </summary>
        /// <param name="culture">Culture supplying the number format data and the unit list.</param>
        /// <param name="appendWordTerminator">If true, <c>"#&gt;"</c> is appended after the unit group.</param>
        /// <returns>The deterministic FST.</returns>
        internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture,
                                                                  bool appendWordTerminator)
        {
            NumberFormatData formatData = NumberPatternComputer.GetNumberFormatData(culture, true, true);

            // the number part is computed without terminator; the terminator (if any) goes after the units
            string numberPart = Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(formatData, true, false);
            System.Text.StringBuilder pattern = new StringBuilder(numberPart);

            // optional separator group between number and unit
            pattern.Append("(");
            bool firstBlank = true;
            NumberPatternComputer.AppendDisjunction(pattern, Core.CharacterProperties.Blanks, 'U', ref firstBlank);
            pattern.Append(")?(");

            // unit group: unit1|unit2|...
            bool needsSeparator = false;
            foreach (string unit in Core.Tokenization.PhysicalUnit.GetUnits(culture, false).Items)
            {
                if (needsSeparator)
                {
                    pattern.Append("|");
                }
                needsSeparator = true;

                // the unit's first char emits 'U' itself, which covers input without a
                // whitespace separator between number and unit
                pattern.AppendFormat("(<{0}:U>", FST.FST.EscapeSpecial(unit[0]));
                if (unit.Length > 1)
                {
                    pattern.Append(FST.FST.EscapeSpecial(unit.Substring(1)));
                }
                pattern.Append(")");
            }
            pattern.Append(")");

            if (appendWordTerminator)
            {
                // "word terminator"
                pattern.Append("#>");
            }

            LanguagePlatform.Lingua.FST.FST automaton = LanguagePlatform.Lingua.FST.FST.Create(pattern.ToString());
            automaton.MakeDeterministic();

#if DEBUG
            // debugging aid: flip the flag (e.g. in the debugger) to write the automaton to disk
            bool writeDump = false;
            if (writeDump)
            {
                automaton.Dump(String.Format("d:/temp/measure-fst-{0}.txt", culture.Name));
            }
#endif

            return automaton;
        }
예제 #5
0
        /// <summary>
        /// Computes the number regex patterns for <paramref name="culture"/>, using the decimal and group
        /// separators selected by <paramref name="separatorMode"/>.
        /// </summary>
        /// <remarks>
        /// Patterns are first computed for the culture's native digits. Further pattern sets are appended
        /// for the full-width digits U+FF10..U+FF19 when
        /// <c>NumberPatternComputer.DoAddFullwidthVariantDigits</c> returns true, and for the digits 0-9
        /// when <c>CultureInfoExtensions.UsesDefaultDigits</c> returns false.
        /// </remarks>
        /// <param name="culture">A non-neutral culture; its <c>NumberFormat</c> supplies digits and separators.</param>
        /// <param name="separatorMode">
        /// <c>CultureDefault</c> keeps the culture's separators, <c>Swapped</c> exchanges decimal and group
        /// separator, <c>EnUS</c> forces "." as decimal and "," as group separator.
        /// </param>
        /// <param name="first">
        /// Receives the character set reported by the underlying pattern computation, with the sets of all
        /// appended alternate digit patterns added to it; <c>null</c> if the culture is neutral.
        /// </param>
        /// <returns>
        /// The pattern list, or <c>null</c> if the culture is neutral or the underlying computation returns null.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="separatorMode"/> is not a handled value.</exception>
        internal static List<string> ComputeRXPatterns(System.Globalization.CultureInfo culture,
                                                       NumberSeparatorMode separatorMode,
                                                       out CharacterSet first)
        {
            first = null;

            if (culture == null)
            {
                throw new ArgumentNullException("culture");
            }

            if (culture.IsNeutralCulture)
            {
                System.Diagnostics.Debug.Assert(false, "Cannot compute number pattern for neutral cultures");
                return null;
            }

            string decSep = culture.NumberFormat.NumberDecimalSeparator;
            string grpSep = culture.NumberFormat.NumberGroupSeparator;

            switch (separatorMode)
            {
            case NumberSeparatorMode.CultureDefault:
                // keep the culture's own separators
                break;

            case NumberSeparatorMode.Swapped:
            {
                string tmp = decSep;
                decSep = grpSep;
                grpSep = tmp;
                break;
            }

            case NumberSeparatorMode.EnUS:
                decSep = ".";
                grpSep = ",";
                break;

            default:
                // a specific exception type instead of the reserved System.Exception;
                // still caught by callers that catch Exception
                throw new ArgumentOutOfRangeException("separatorMode", separatorMode, "Unexpected");
            }

            List<string> result = ComputeRXPatterns(grpSep, decSep, culture.NumberFormat.NativeDigits, out first);
            if (result == null)
            {
                return null;
            }

            // add full-width variant digits for jp, zh, ko
            if (NumberPatternComputer.DoAddFullwidthVariantDigits(culture))
            {
                string[] fullwidthDigits =
                {
                    "\uFF10", "\uFF11", "\uFF12", "\uFF13", "\uFF14",
                    "\uFF15", "\uFF16", "\uFF17", "\uFF18", "\uFF19"
                };
                AppendAlternateDigitPatterns(grpSep, decSep, fullwidthDigits, result, first);
            }

            // add patterns with default digits
            if (!CultureInfoExtensions.UsesDefaultDigits(culture))
            {
                string[] defaultDigits = { "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" };
                AppendAlternateDigitPatterns(grpSep, decSep, defaultDigits, result, first);
            }

            return result;
        }

        /// <summary>
        /// Computes the patterns for an alternate digit set and, if there are any, appends them to
        /// <paramref name="result"/> and adds their character set to <paramref name="first"/>.
        /// </summary>
        private static void AppendAlternateDigitPatterns(string grpSep, string decSep, IList<string> digits,
                                                         List<string> result, CharacterSet first)
        {
            CharacterSet alternateFirst;
            List<string> alternate = ComputeRXPatterns(grpSep, decSep, digits, out alternateFirst);
            if (alternate != null)
            {
                result.AddRange(alternate);
                // NOTE(review): assumes the underlying overload yields a non-null set whenever it
                // yields patterns (same assumption as before this refactoring) - confirm
                first.Add(alternateFirst);
            }
        }