/// <summary>
/// Builds the regex table from the cardinal and fraction extractors and loads the
/// ambiguity filters defined in <c>NumbersDefinitions.AmbiguityFiltersDict</c>.
/// </summary>
/// <param name="config">Options forwarded to the nested extractors; its <c>Mode</c> decides whether ambiguity filters are loaded.</param>
/// <param name="mode">CJK extraction mode forwarded to the cardinal extractor.</param>
public NumberExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var regexTable = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

    // Cardinal patterns are registered first, fraction patterns second.
    regexTable.AddRange(new CardinalExtractor(config, mode).Regexes);
    regexTable.AddRange(new FractionExtractor(config).Regexes);

    Regexes = regexTable.ToImmutable();

    var filterTable = ImmutableDictionary.CreateBuilder<Regex, Regex>();

    // In NumberWithUnit mode the ambiguous cases (e.g. 'that one') must not be filtered,
    // otherwise they can't be resolved later, so the filter table is left empty.
    if (config.Mode != NumberMode.Unit)
    {
        foreach (var filter in NumbersDefinitions.AmbiguityFiltersDict)
        {
            var keyRegex = new Regex(filter.Key, RegexFlags);
            var valueRegex = new Regex(filter.Value, RegexFlags);
            filterTable.Add(keyRegex, valueRegex);
        }
    }

    AmbiguityFiltersDict = filterTable.ToImmutable();
}
/// <summary>
/// Registers the integer patterns (numeric forms, half/dozen forms and the
/// mode-dependent Chinese integer pattern) and publishes them as <c>Regexes</c>.
/// </summary>
/// <param name="config">Number options; not read by this constructor.</param>
/// <param name="mode">Selects the allow-list pattern (<c>Default</c>) or the aggressive pattern (<c>ExtractAll</c>).</param>
public IntegerExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var table = new Dictionary<Regex, TypeTag>();

    // 123456, -123456
    table.Add(
        new Regex(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 15k, 16 G
    table.Add(
        new Regex(NumbersDefinitions.NumbersSpecialsCharsWithSuffix, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 1,234, 2,332,111
    table.Add(
        new Regex(NumbersDefinitions.DottedNumbersSpecialsChar, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 半百 半打
    table.Add(
        new Regex(NumbersDefinitions.NumbersWithHalfDozen, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));

    // 半
    table.Add(
        new Regex(NumbersDefinitions.HalfUnitRegex, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));

    // 一打 五十打
    table.Add(
        new Regex(NumbersDefinitions.NumbersWithDozen, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));

    if (mode == CJKNumberExtractorMode.Default)
    {
        // 一百五十五, 负一亿三百二十二.
        // The allow-list pattern avoids extracting "四" from "四川".
        table.Add(
            new Regex(NumbersDefinitions.NumbersWithAllowListRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));
    }
    else if (mode == CJKNumberExtractorMode.ExtractAll)
    {
        // 一百五十五, 负一亿三百二十二, and also "四" from "四川".
        // No allow list: every potential integer is extracted (useful in Units, for example).
        table.Add(
            new Regex(NumbersDefinitions.NumbersAggressiveRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.CHINESE));
    }

    Regexes = table.ToImmutableDictionary();
}
/// <summary>
/// Registers the integer patterns (numeric forms, half/dozen forms and the
/// mode-dependent Japanese integer pattern), obtaining each regex through
/// <c>RegexCache.Get</c>, and publishes them as <c>Regexes</c>.
/// </summary>
/// <param name="mode">Selects the allow-list pattern (<c>Default</c>) or the aggressive pattern (<c>ExtractAll</c>).</param>
public IntegerExtractor(CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var table = new Dictionary<Regex, TypeTag>();

    // 123456, -123456
    table.Add(
        RegexCache.Get(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 15k, 16 G
    table.Add(
        RegexCache.Get(NumbersDefinitions.NumbersSpecialsCharsWithSuffix, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 1,234, 2,332,111
    table.Add(
        RegexCache.Get(NumbersDefinitions.DottedNumbersSpecialsChar, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 半百 半ダース
    table.Add(
        RegexCache.Get(NumbersDefinitions.NumbersWithHalfDozen, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));

    // 一ダース 五十ダース
    table.Add(
        RegexCache.Get(NumbersDefinitions.NumbersWithDozen, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));

    if (mode == CJKNumberExtractorMode.Default)
    {
        // 一百五十五, 负一亿三百二十二.
        // The allow-list pattern avoids extracting "九" from "西九条".
        table.Add(
            RegexCache.Get(NumbersDefinitions.NumbersWithAllowListRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));
    }
    else if (mode == CJKNumberExtractorMode.ExtractAll)
    {
        // 一百五十五, 负一亿三百二十二, and also "九" from "西九条".
        // No allow list: every potential integer is extracted (useful in Units, for example).
        table.Add(
            RegexCache.Get(NumbersDefinitions.NumbersAggressiveRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.JAPANESE));
    }

    Regexes = table.ToImmutableDictionary();
}
/// <summary>
/// Builds the cardinal regex table as the union of the integer and double
/// extractor tables (CardinalExtractor = Int + Double).
/// </summary>
/// <param name="config">Number options forwarded to both nested extractors.</param>
/// <param name="mode">CJK extraction mode forwarded to the integer extractor.</param>
public CardinalExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var regexTable = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

    // Integer patterns are registered first, double patterns second.
    regexTable.AddRange(new IntegerExtractor(config, mode).Regexes);
    regexTable.AddRange(new DoubleExtractor(config).Regexes);

    Regexes = regexTable.ToImmutable();
}
/// <summary>
/// Builds the cardinal regex table as the union of the integer and double
/// extractor tables (CardinalExtractor = Int + Double).
/// </summary>
/// <param name="mode">CJK extraction mode forwarded to the integer extractor.</param>
public CardinalExtractor(CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var regexTable = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

    // Integer patterns are registered first, double patterns second.
    regexTable.AddRange(new IntegerExtractor(mode).Regexes);
    regexTable.AddRange(new DoubleExtractor().Regexes);

    Regexes = regexTable.ToImmutable();
}
/// <summary>
/// Builds the number regex table as the union of the cardinal and fraction
/// extractor tables.
/// </summary>
/// <param name="config">Number options forwarded to both nested extractors.</param>
/// <param name="mode">CJK extraction mode forwarded to the cardinal extractor.</param>
public NumberExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var regexTable = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

    // Cardinal patterns are registered first, fraction patterns second.
    regexTable.AddRange(new CardinalExtractor(config, mode).Regexes);
    regexTable.AddRange(new FractionExtractor(config).Regexes);

    Regexes = regexTable.ToImmutable();
}
/// <summary>
/// Builds the number regex table as the union of the cardinal and fraction
/// extractor tables.
/// </summary>
/// <param name="mode">CJK extraction mode forwarded to the cardinal extractor.</param>
public NumberExtractor(CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var regexTable = ImmutableDictionary.CreateBuilder<Regex, TypeTag>();

    // Cardinal patterns are registered first, fraction patterns second.
    regexTable.AddRange(new CardinalExtractor(mode).Regexes);
    regexTable.AddRange(new FractionExtractor().Regexes);

    Regexes = regexTable.ToImmutable();
}
/// <summary>
/// Registers the double-number patterns (decimal, exponential, scientific,
/// long-format and the mode-dependent patterns) and publishes them as <c>Regexes</c>.
/// </summary>
/// <param name="config">Number options; not read by this constructor.</param>
/// <param name="mode">Selects the regular patterns (<c>Default</c>) or the aggressive patterns (<c>ExtractAll</c>).</param>
public DoubleExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var table = new Dictionary<Regex, TypeTag>();

    // (-)2.5, can avoid cases like ip address xx.xx.xx.xx
    table.Add(
        new Regex(NumbersDefinitions.DoubleSpecialsCharsWithNegatives, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

    // (-).2
    table.Add(
        new Regex(NumbersDefinitions.SimpleDoubleSpecialsChars, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

    // えは九・二三二一三一二
    table.Add(
        new Regex(NumbersDefinitions.DoubleRoundNumberSpecialsChars, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

    // 15.2万
    table.Add(
        new Regex(NumbersDefinitions.DoubleWithThousandsRegex, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.JAPANESE));

    // 2e6, 21.2e0
    table.Add(
        new Regex(NumbersDefinitions.DoubleExponentialNotationRegex, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.POWER_SUFFIX));

    // Exponential notation, kanji variant.
    table.Add(
        new Regex(NumbersDefinitions.DoubleExponentialNotationKanjiRegex, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.POWER_SUFFIX));

    // 2^5
    table.Add(
        new Regex(NumbersDefinitions.DoubleScientificNotationRegex, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.POWER_SUFFIX));

    // 1 234 567.89 (full-width blank variant)
    table.Add(
        GenerateLongFormatNumberRegexes(LongFormatType.DoubleNumFullWidthBlankDot),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

    // 1 234 567.89
    table.Add(
        GenerateLongFormatNumberRegexes(LongFormatType.DoubleNumBlankDot),
        RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

    if (mode == CJKNumberExtractorMode.Default)
    {
        // The allow-list pattern avoids extracting "九" from "西九条".
        table.Add(
            new Regex(NumbersDefinitions.DoubleSpecialsChars, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

        // 1.0 K
        table.Add(
            new Regex(NumbersDefinitions.DoubleWithMultiplierRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));
    }
    else if (mode == CJKNumberExtractorMode.ExtractAll)
    {
        // No allow list: every potential number is extracted (useful in Units, for example).
        table.Add(
            new Regex(NumbersDefinitions.DoubleSpecialsCharsAggressive, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));

        // 1.0 K
        table.Add(
            new Regex(NumbersDefinitions.DoubleWithMultiplierAggressiveRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.DOUBLE_PREFIX, Constants.NUMBER_SUFFIX));
    }

    Regexes = table.ToImmutableDictionary();
}
/// <summary>
/// Registers the integer patterns (numeric forms, Korean forms and the
/// mode-dependent patterns) and publishes them as <c>Regexes</c>.
/// </summary>
/// <param name="config">Number options; not read by this constructor.</param>
/// <param name="mode">Selects the allow-list pattern (<c>Default</c>) or the aggressive patterns (<c>ExtractAll</c>).</param>
public IntegerExtractor(BaseNumberOptionsConfiguration config, CJKNumberExtractorMode mode = CJKNumberExtractorMode.Default)
{
    var table = new Dictionary<Regex, TypeTag>();

    // 123456, -123456
    table.Add(
        new Regex(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 15k, 16 G
    table.Add(
        new Regex(NumbersDefinitions.NumbersSpecialsCharsWithSuffix, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 1,234, 2,332,111
    table.Add(
        new Regex(NumbersDefinitions.DottedNumbersSpecialsChar, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 마이너스 일, 마이너스 오
    table.Add(
        new Regex(NumbersDefinitions.ZeroToNineIntegerSpecialsChars, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

    // 마이너스 일, 마이너스 오
    // NOTE(review): NumbersSpecialsChars is already registered above with NUMBER_SUFFIX;
    // this second instance carries the KOREAN tag. Confirm the double registration is intended.
    table.Add(
        new Regex(NumbersDefinitions.NumbersSpecialsChars, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

    // 다스
    table.Add(
        new Regex(NumbersDefinitions.NumbersWithDozen, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

    // 3백21
    table.Add(
        new Regex(NumbersDefinitions.NativeCumKoreanRegex, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.NUMBER_SUFFIX));

    // 스물여섯
    table.Add(
        new Regex(NumbersDefinitions.NativeSingleRegex, RegexFlags),
        RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

    if (mode == CJKNumberExtractorMode.Default)
    {
        // 일백오십오
        table.Add(
            new Regex(NumbersDefinitions.NumbersWithAllowListRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));
    }
    else if (mode == CJKNumberExtractorMode.ExtractAll)
    {
        // 일백오십오, and also "사" from "사직구장" ("사" is a homonym that can read as four (4)
        // or as another Chinese character).
        // No allow list: every potential integer is extracted (useful in Units, for example).
        table.Add(
            new Regex(NumbersDefinitions.NumbersAggressiveRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));

        table.Add(
            new Regex(NumbersDefinitions.InexactNumberUnitRegex, RegexFlags),
            RegexTagGenerator.GenerateRegexTag(Constants.INTEGER_PREFIX, Constants.KOREAN));
    }

    Regexes = table.ToImmutableDictionary();
}