Example #1
0
        /// <summary>
        /// Builds <c>Regexes</c> by merging the cardinal and fraction extractors' regex tables,
        /// then builds <c>AmbiguityFiltersDict</c> from the language definitions.
        /// </summary>
        /// <param name="config">Options forwarded to the child extractors; its <c>Mode</c> decides whether ambiguity filters are loaded.</param>
        /// <param name="mode">CJK extraction mode forwarded to the cardinal extractor.</param>
        public NumberExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var regexTable = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

            // Cardinal patterns first, then fraction patterns.
            regexTable.AddRange(new CardinalExtractor(config, mode).Regexes);
            regexTable.AddRange(new FractionExtractor(config).Regexes);

            Regexes = regexTable.ToImmutable();

            var filters = ImmutableDictionary.CreateBuilder<Regex, Regex>();

            // In NumberWithUnit (Unit mode) ambiguous numbers such as 'that one' must not be
            // filtered out here, otherwise they can't be resolved later; the table stays empty.
            var inUnitMode = config.Mode == NumberMode.Unit;
            if (!inUnitMode)
            {
                foreach (var pair in NumbersDefinitions.AmbiguityFiltersDict)
                {
                    filters.Add(new Regex(pair.Key, RegexFlags), new Regex(pair.Value, RegexFlags));
                }
            }

            AmbiguityFiltersDict = filters.ToImmutable();
        }
Example #2
0
        /// <summary>
        /// Populates <c>Regexes</c> with the integer patterns and their type tags. The final
        /// pattern registered depends on <paramref name="mode"/>.
        /// </summary>
        /// <param name="config">Number options; not read inside this constructor body.</param>
        /// <param name="mode">
        /// <c>Default</c> registers the allow-list pattern, <c>ExtractAll</c> registers the
        /// aggressive pattern; any other value registers neither.
        /// </param>
        public IntegerExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var table = new Dictionary<Regex, TypeTag>();

            // Plain digits, optionally signed: 123456, -123456
            table.Add(
                new Regex(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Digits followed by a suffix: 15k, 16 G
            table.Add(
                new Regex(NumbersDefinitions.NumbersSpecialsCharsWithSuffix, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Digit groups with separators: 1,234, 2,332,111
            table.Add(
                new Regex(NumbersDefinitions.DottedNumbersSpecialsChar, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // "Half" quantities: 半百, 半打
            table.Add(
                new Regex(NumbersDefinitions.NumbersWithHalfDozen, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));

            // The bare "half" unit: 半
            table.Add(
                new Regex(NumbersDefinitions.HalfUnitRegex, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));

            // Dozens: 一打, 五十打
            table.Add(
                new Regex(NumbersDefinitions.NumbersWithDozen, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));

            if (mode == CJKNumberExtractorMode.Default)
            {
                // Spelled-out integers such as 一百五十五, 负一亿三百二十二.
                // An allow list prevents extracting "四" from "四川".
                table.Add(
                    new Regex(NumbersDefinitions.NumbersWithAllowListRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));
            }
            else if (mode == CJKNumberExtractorMode.ExtractAll)
            {
                // Same spelled-out integers, but with no allow list: every potential integer
                // is extracted, including "四" from "四川" (useful in Units, for example).
                table.Add(
                    new Regex(NumbersDefinitions.NumbersAggressiveRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));
            }

            Regexes = table.ToImmutableDictionary();
        }
        /// <summary>
        /// Populates <c>Regexes</c> with the integer patterns (obtained through
        /// <c>RegexCache</c>) and their type tags. The final pattern registered depends on
        /// <paramref name="mode"/>.
        /// </summary>
        /// <param name="mode">
        /// <c>Default</c> registers the allow-list pattern, <c>ExtractAll</c> registers the
        /// aggressive pattern; any other value registers neither.
        /// </param>
        public IntegerExtractor(CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var table = new Dictionary<Regex, TypeTag>();

            // Plain digits, optionally signed: 123456, -123456
            table.Add(
                RegexCache.Get(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Digits followed by a suffix: 15k, 16 G
            table.Add(
                RegexCache.Get(NumbersDefinitions.NumbersSpecialsCharsWithSuffix, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Digit groups with separators: 1,234, 2,332,111
            table.Add(
                RegexCache.Get(NumbersDefinitions.DottedNumbersSpecialsChar, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // "Half" quantities: 半百, 半ダース
            table.Add(
                RegexCache.Get(NumbersDefinitions.NumbersWithHalfDozen, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));

            // Dozens: 一ダース, 五十ダース
            table.Add(
                RegexCache.Get(NumbersDefinitions.NumbersWithDozen, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));

            if (mode == CJKNumberExtractorMode.Default)
            {
                // Spelled-out integers such as 一百五十五, 负一亿三百二十二.
                // An allow list prevents extracting "九" from "西九条".
                table.Add(
                    RegexCache.Get(NumbersDefinitions.NumbersWithAllowListRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));
            }
            else if (mode == CJKNumberExtractorMode.ExtractAll)
            {
                // Same spelled-out integers, but with no allow list: every potential integer
                // is extracted, including "九" from "西九条" (useful in Units, for example).
                table.Add(
                    RegexCache.Get(NumbersDefinitions.NumbersAggressiveRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));
            }

            Regexes = table.ToImmutableDictionary();
        }
        /// <summary>
        /// Builds <c>Regexes</c> as the union of the integer and double extractors' regex
        /// tables (cardinal = integer + double).
        /// </summary>
        /// <param name="config">Options forwarded to both child extractors.</param>
        /// <param name="mode">CJK extraction mode; forwarded to the integer extractor only.</param>
        public CardinalExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var merged = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

            // Integer patterns honour the requested mode.
            merged.AddRange(new IntegerExtractor(config, mode).Regexes);

            // NOTE(review): mode is not passed to DoubleExtractor here - confirm that is
            // intended for this language's DoubleExtractor.
            merged.AddRange(new DoubleExtractor(config).Regexes);

            Regexes = merged.ToImmutable();
        }
Example #5
0
        /// <summary>
        /// Builds <c>Regexes</c> as the union of the integer and double extractors' regex
        /// tables (cardinal = integer + double).
        /// </summary>
        /// <param name="mode">CJK extraction mode forwarded to the integer extractor.</param>
        public CardinalExtractor(CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var merged = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

            // Integer patterns, built for the requested mode.
            merged.AddRange(new IntegerExtractor(mode).Regexes);

            // Double patterns, built with that extractor's defaults.
            merged.AddRange(new DoubleExtractor().Regexes);

            Regexes = merged.ToImmutable();
        }
        /// <summary>
        /// Builds <c>Regexes</c> by merging the cardinal and fraction extractors' regex tables.
        /// </summary>
        /// <param name="config">Options forwarded to both child extractors.</param>
        /// <param name="mode">CJK extraction mode forwarded to the cardinal extractor.</param>
        public NumberExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var merged = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

            // Cardinal patterns.
            merged.AddRange(new CardinalExtractor(config, mode).Regexes);

            // Fraction patterns.
            merged.AddRange(new FractionExtractor(config).Regexes);

            Regexes = merged.ToImmutable();
        }
        /// <summary>
        /// Builds <c>Regexes</c> by merging the cardinal and fraction extractors' regex tables.
        /// </summary>
        /// <param name="mode">CJK extraction mode forwarded to the cardinal extractor.</param>
        public NumberExtractor(CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var merged = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

            // Cardinal patterns, built for the requested mode.
            merged.AddRange(new CardinalExtractor(mode).Regexes);

            // Fraction patterns, built with that extractor's defaults.
            merged.AddRange(new FractionExtractor().Regexes);

            Regexes = merged.ToImmutable();
        }
        /// <summary>
        /// Populates <c>Regexes</c> with the double (decimal) patterns and their type tags.
        /// The last two patterns registered depend on <paramref name="mode"/>.
        /// </summary>
        /// <param name="config">Number options; not read inside this constructor body.</param>
        /// <param name="mode">
        /// <c>Default</c> registers the allow-list variants, <c>ExtractAll</c> registers the
        /// aggressive variants; any other value registers neither pair.
        /// </param>
        public DoubleExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var table = new Dictionary<Regex, TypeTag>();

            // (-)2.5 - written so that IP-address-like text xx.xx.xx.xx is not matched
            table.Add(
                new Regex(NumbersDefinitions.DoubleSpecialsCharsWithNegatives, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

            // (-).2
            table.Add(
                new Regex(NumbersDefinitions.SimpleDoubleSpecialsChars, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

            // Kanji digits around a middle dot, e.g. えは九・二三二一三一二
            table.Add(
                new Regex(NumbersDefinitions.DoubleRoundNumberSpecialsChars, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

            // 15.2万
            table.Add(
                new Regex(NumbersDefinitions.DoubleWithThousandsRegex, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.JAPANESE));

            // Exponential notation: 2e6, 21.2e0
            table.Add(
                new Regex(NumbersDefinitions.DoubleExponentialNotationRegex, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.POWER_SUFFIX));

            // Exponential notation, kanji variant (per the definition's name).
            table.Add(
                new Regex(NumbersDefinitions.DoubleExponentialNotationKanjiRegex, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.POWER_SUFFIX));

            // Power notation: 2^5
            table.Add(
                new Regex(NumbersDefinitions.DoubleScientificNotationRegex, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.POWER_SUFFIX));

            // Long format 1 234 567.89, full-width blank variant (per the enum member's name).
            table.Add(
                GenerateLongFormatNumberRegexes(LongFormatType.DoubleNumFullWidthBlankDot),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

            // Long format 1 234 567.89
            table.Add(
                GenerateLongFormatNumberRegexes(LongFormatType.DoubleNumBlankDot),
                RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

            if (mode == CJKNumberExtractorMode.Default)
            {
                // An allow list prevents extracting "九" from "西九条".
                table.Add(
                    new Regex(NumbersDefinitions.DoubleSpecialsChars, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

                // Number with multiplier: 1.0 K
                table.Add(
                    new Regex(NumbersDefinitions.DoubleWithMultiplierRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));
            }
            else if (mode == CJKNumberExtractorMode.ExtractAll)
            {
                // No allow list: extracts all potential numbers (useful in Units, for example).
                table.Add(
                    new Regex(NumbersDefinitions.DoubleSpecialsCharsAggressive, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

                // Number with multiplier: 1.0 K
                table.Add(
                    new Regex(NumbersDefinitions.DoubleWithMultiplierAggressiveRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));
            }

            Regexes = table.ToImmutableDictionary();
        }
        /// <summary>
        /// Populates <c>Regexes</c> with the integer patterns and their type tags. The final
        /// pattern(s) registered depend on <paramref name="mode"/>.
        /// </summary>
        /// <param name="config">Number options; not read inside this constructor body.</param>
        /// <param name="mode">
        /// <c>Default</c> registers the allow-list pattern; <c>ExtractAll</c> registers the
        /// aggressive pattern plus the inexact-number-unit pattern; any other value registers none.
        /// </param>
        public IntegerExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
        {
            var table = new Dictionary<Regex, TypeTag>();

            // Plain digits, optionally signed: 123456, -123456
            table.Add(
                new Regex(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Digits followed by a suffix: 15k, 16 G
            table.Add(
                new Regex(NumbersDefinitions.NumbersSpecialsCharsWithSuffix, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Digit groups with separators: 1,234, 2,332,111
            table.Add(
                new Regex(NumbersDefinitions.DottedNumbersSpecialsChar, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Signed single digits in words: 마이너스 일, 마이너스 오
            table.Add(
                new Regex(NumbersDefinitions.ZeroToNineIntegerSpecialsChars, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

            // 마이너스 일, 마이너스 오
            // NOTE(review): this is the same definition as the first entry, registered a second
            // time under the KOREAN tag (a separate Regex instance, so a separate key) -
            // confirm the duplication is intentional.
            table.Add(
                new Regex(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

            // Dozens: 다스
            table.Add(
                new Regex(NumbersDefinitions.NumbersWithDozen, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

            // Digits mixed with Korean units: 3백21
            table.Add(
                new Regex(NumbersDefinitions.NativeCumKoreanRegex, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

            // Native Korean numerals: 스물여섯
            table.Add(
                new Regex(NumbersDefinitions.NativeSingleRegex, RegexFlags),
                RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

            if (mode == CJKNumberExtractorMode.Default)
            {
                // Spelled-out integers restricted by an allow list: 일백오십오
                table.Add(
                    new Regex(NumbersDefinitions.NumbersWithAllowListRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));
            }
            else if (mode == CJKNumberExtractorMode.ExtractAll)
            {
                // No allow list: extracts all potential integers (useful in Units, for example),
                // e.g. 일백오십오, and also "사" from "사직구장" ("사" is a homonym that can read
                // as four (4) or as another Chinese character).
                table.Add(
                    new Regex(NumbersDefinitions.NumbersAggressiveRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

                // Inexact number units (per the definition's name).
                table.Add(
                    new Regex(NumbersDefinitions.InexactNumberUnitRegex, RegexFlags),
                    RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));
            }

            Regexes = table.ToImmutableDictionary();
        }