/// <summary>
/// Demotes ambiguous "fraction" results to plain numbers.
/// For input such as "more than 30000 in 2010", the span "30000 in 2010" must not be
/// treated as a fraction; this method trims such a result down to "30000".
/// </summary>
/// <param name="ers">Extraction results to inspect. Matching entries are modified in place.</param>
/// <returns>The same list instance that was passed in.</returns>
private List<ExtractResult> RemoveAmbiguousFractions(List<ExtractResult> ers)
{
    foreach (var candidate in ers)
    {
        if (candidate.Data == null)
        {
            continue;
        }

        // Only results tagged with the English fraction tag are candidates for rewriting.
        var dataTag = candidate.Data.ToString();
        var fractionTag = RegexTagGenerator.GenerateRegexTag(Constants.FRACTION_PREFIX, Constants.ENGLISH).Name;
        if (dataTag != fractionTag)
        {
            continue;
        }

        var connector = AmbiguousFractionConnectorsRegex.Match(candidate.Text);
        if (!connector.Success)
        {
            continue;
        }

        // Keep only the text that precedes the connector, without trailing whitespace,
        // and reclassify the result as an ordinary number.
        var numberText = candidate.Text.Substring(0, connector.Index).TrimEnd();
        candidate.Length = numberText.Length;
        candidate.Text = numberText;
        candidate.Type = Constants.SYS_NUM;
        candidate.Data = null;
    }

    return ers;
}
/// <summary>
/// Runs every regex in <c>Regexes</c> over <paramref name="source"/> and turns each maximal
/// run of matched characters that coincides exactly with one regex match into an
/// <c>ExtractResult</c>.
/// </summary>
/// <param name="source">Text to scan.</param>
/// <returns>
/// The results after they have been passed through <c>FilterAmbiguity</c>;
/// an empty list when <paramref name="source"/> is null or empty.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var extracted = new List<ExtractResult>();
    var tagByMatch = new Dictionary<Match, TypeTag>();
    var covered = new bool[source.Length];

    var matchesByTag = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);

    foreach (var entry in matchesByTag)
    {
        foreach (Match hit in entry.Key)
        {
            // In ExperimentalMode, ambiguous fractions like "30000 in 2009" are skipped.
            var experimental = (Options & NumberOptions.ExperimentalMode) != 0;
            if (experimental && AmbiguousFractionConnectorsRegex.Match(hit.Value).Success)
            {
                continue;
            }

            // Unless EnablePreview is set, relative ordinals such as "last" or "next" are skipped.
            var previewDisabled = (Options & NumberOptions.EnablePreview) == 0;
            if (previewDisabled && IsRelativeOrdinal(hit.Value))
            {
                continue;
            }

            // Flag every character covered by this match.
            for (var pos = hit.Index; pos < hit.Index + hit.Length; pos++)
            {
                covered[pos] = true;
            }

            // Filter out cases like "first two", "last one" (original note: English only for now).
            // The filter regex is tested against the whole source, not the individual match;
            // the characters stay flagged, but the match is not recorded.
            if (ExtractType.Contains(Constants.MODEL_ORDINAL)
                && RelativeOrdinalFilterRegex != null
                && RelativeOrdinalFilterRegex.IsMatch(source))
            {
                continue;
            }

            // Remember which tag produced the match, for extra information later.
            tagByMatch.Add(hit, entry.Value);
        }
    }

    // Index of the most recent character that is NOT covered by any match.
    var lastGap = -1;

    for (var i = 0; i < source.Length; i++)
    {
        if (!covered[i])
        {
            lastGap = i;
            continue;
        }

        // Only act on the final character of a covered run.
        var runContinues = i + 1 != source.Length && covered[i + 1];
        if (runContinues)
        {
            continue;
        }

        var runStart = lastGap + 1;
        var runLength = i - lastGap;
        var text = source.Substring(runStart, runLength);

        // The run is reported only if some recorded match spans it exactly.
        var tags = tagByMatch
            .Where(p => p.Key.Index == runStart && p.Key.Length == runLength)
            .Select(p => (p.Value.Priority, p.Value.Name))
            .ToList();
        if (tags.Count == 0)
        {
            continue;
        }

        var type = tags.Min().Item2;

        var start = runStart;
        var length = runLength;

        // Extract negative numbers: look for a negative term in the text preceding the run.
        var negative = NegativeNumberTermsRegex?.Match(source.Substring(0, start));
        if (negative != null && negative.Success)
        {
            start = negative.Index;
            length = length + negative.Length;
            text = negative.Value + text;
        }

        var er = new ExtractResult
        {
            Start = start,
            Length = length,
            Text = text,
            Type = ExtractType,
            Data = type,
        };

        // Add Metadata information for Ordinal
        if (ExtractType.Contains(Constants.MODEL_ORDINAL))
        {
            er.Metadata = new Metadata();

            if (IsRelativeOrdinal(text))
            {
                er.Metadata.IsOrdinalRelative = true;
            }
        }

        extracted.Add(er);
    }

    return FilterAmbiguity(extracted, source);
}
/// <summary>
/// Tells whether <paramref name="numberStr"/> matches <c>AmbiguousFractionConnectorsRegex</c>.
/// A fraction with an "in" connector may lead to ambiguous cases like "more than 30000 in 2010";
/// in ExperimentalMode all such numbers are removed to avoid them.
/// </summary>
/// <param name="numberStr">Candidate number text.</param>
/// <returns><c>true</c> when the ambiguous-connector regex matches the text.</returns>
private bool IsFractionWithInConnector(string numberStr)
    => AmbiguousFractionConnectorsRegex.IsMatch(numberStr);
/// <summary>
/// Runs every regex in <c>Regexes</c> over <paramref name="source"/> and turns each maximal
/// run of matched characters that coincides exactly with one regex match into an
/// <c>ExtractResult</c>.
/// </summary>
/// <param name="source">Text to scan.</param>
/// <returns>
/// The results after they have been passed through <c>FilterAmbiguity</c>;
/// an empty list when <paramref name="source"/> is null or empty.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var result = new List<ExtractResult>();
    var matchSource = new Dictionary<Match, TypeTag>();
    var matched = new bool[source.Length];

    var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);

    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            // In ExperimentalMode, AmbiguousFraction like "30000 in 2009" needs to be skipped
            if ((Options & NumberOptions.ExperimentalMode) != 0 &&
                AmbiguousFractionConnectorsRegex.Match(m.Value).Success)
            {
                continue;
            }

            // If SuppressExtendedTypes is on, cases like "last", "next" should be skipped
            if ((Options & NumberOptions.SuppressExtendedTypes) != 0 &&
                m.Groups[Constants.RelativeOrdinalGroupName].Success)
            {
                continue;
            }

            // Matches containing separators 'in', 'out of' are dropped when the numerator is
            // greater than the denominator. Equal values are kept.
            // NOTE(review): an earlier comment said "numerator < denominator"; confirm whether
            // the equal case is meant to be accepted.
            if (m.Groups["ambiguousSeparator"].Success)
            {
                int num = ParseNumber(m.Groups["numerator"]);
                int den = ParseNumber(m.Groups["denominator"]);

                if (num > den)
                {
                    continue;
                }
            }

            // Flag every character covered by this match.
            for (var j = 0; j < m.Length; j++)
            {
                matched[m.Index + j] = true;
            }

            // Keep Source Data for extra information
            matchSource.Add(m, collection.Value);
        }
    }

    // Index of the most recent character that is NOT covered by any match.
    var last = -1;

    for (var i = 0; i < source.Length; i++)
    {
        if (!matched[i])
        {
            last = i;
            continue;
        }

        // Only act on the final character of a covered run.
        if (i + 1 != source.Length && matched[i + 1])
        {
            continue;
        }

        var runStart = last + 1;
        var runLength = i - last;
        var substr = source.Substring(runStart, runLength);

        // Pick the recorded match that spans the run exactly, preferring the lowest priority
        // value and then the lowest tag name. Ordering on those two keys (instead of taking
        // Min over a tuple that also carries the Match) never compares Match instances, which
        // are not IComparable and would make the comparison throw when two candidates tie.
        // OrderBy/ThenBy are stable, so a tie resolves to the first candidate enumerated.
        var candidates = matchSource
            .Where(p => p.Key.Index == runStart && p.Key.Length == runLength)
            .OrderBy(p => p.Value.Priority)
            .ThenBy(p => p.Value.Name)
            .ToList();

        if (candidates.Count == 0)
        {
            continue;
        }

        var type = candidates[0].Value.Name;
        var originalMatch = candidates[0].Key;

        var start = runStart;
        var length = runLength;

        // Extract negative numbers: look for a negative term in the text preceding the run.
        if (NegativeNumberTermsRegex != null)
        {
            var match = NegativeNumberTermsRegex.Match(source.Substring(0, start));
            if (match.Success)
            {
                start = match.Index;
                length += match.Length;
                substr = match.Value + substr;
            }
        }

        var er = new ExtractResult
        {
            Start = start,
            Length = length,
            Text = substr,
            Type = ExtractType,
            Data = type,
        };

        // Add Metadata information for Ordinal
        if (ExtractType.Contains(Constants.MODEL_ORDINAL))
        {
            er.Metadata = new Metadata();

            if ((Options & NumberOptions.SuppressExtendedTypes) == 0 &&
                originalMatch.Groups[Constants.RelativeOrdinalGroupName].Success)
            {
                er.Metadata.IsOrdinalRelative = true;
            }
        }

        result.Add(er);
    }

    result = FilterAmbiguity(result, source);
    return result;
}