/// <summary>
/// Runs each configured regex over <paramref name="text"/> in turn, hands the per-regex
/// coverage map to <c>GetResult</c>, and returns the accumulated results after <c>PostFilter</c>.
/// </summary>
/// <param name="text">Input text to scan; null or empty yields an empty list.</param>
/// <returns>The value returned by <c>PostFilter</c> for the collected results.</returns>
public override List<ExtractResult> Extract(string text)
{
    var extracted = new List<ExtractResult>();

    if (string.IsNullOrEmpty(text))
    {
        return extracted;
    }

    var tagByMatch = new Dictionary<Match, string>();
    var covered = new bool[text.Length];

    var matchSets = Regexes.ToDictionary(entry => entry.Key.Matches(text), entry => entry.Value);

    foreach (var matchSet in matchSets)
    {
        // The coverage map is wiped for every regex; tagByMatch keeps accumulating across regexes.
        for (var pos = 0; pos < covered.Length; pos++)
        {
            covered[pos] = false;
        }

        foreach (Match candidate in matchSet.Key)
        {
            if (!IsValidMatch(candidate))
            {
                continue;
            }

            // Flag every character position spanned by this match.
            var end = candidate.Index + candidate.Length;
            for (var pos = candidate.Index; pos < end; pos++)
            {
                covered[pos] = true;
            }

            // Remember which tag produced the match so it can be attached to the result.
            tagByMatch.Add(candidate, matchSet.Value);
        }

        // Turn the covered intervals of this regex into results.
        GetResult(covered, text, tagByMatch, extracted);
    }

    return PostFilter(extracted);
}
/// <summary>
/// Extracts every maximal run of <paramref name="text"/> covered by the configured regexes that
/// coincides exactly with one regex match, skipping runs whose IPv6-ellipsis edge is directly
/// attached to a neighbouring non-CJK letter or digit.
/// </summary>
/// <param name="text">Input text to scan; null or empty yields an empty list.</param>
/// <returns>One <see cref="ExtractResult"/> per accepted run, in increasing start order.</returns>
public override List<ExtractResult> Extract(string text)
{
    var result = new List<ExtractResult>();

    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    var matchSource = new Dictionary<Match, string>();
    var matched = new bool[text.Length];

    var collections = Regexes.ToDictionary(o => o.Key.Matches(text), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            for (var j = 0; j < m.Length; j++)
            {
                matched[m.Index + j] = true;
            }

            // Keep Source Data for extra information
            matchSource.Add(m, collection.Value);
        }
    }

    // True for a letter or digit that the tokenizer does not classify as CJK.
    bool IsNonCjkLetterOrDigit(char c) => char.IsLetterOrDigit(c) && !SimpleTokenizer.IsCjk(c);

    var lastNotMatched = -1;
    for (var i = 0; i < text.Length; i++)
    {
        if (!matched[i])
        {
            lastNotMatched = i;
            continue;
        }

        // Only act on the last character of a covered run.
        if (i + 1 < text.Length && matched[i + 1])
        {
            continue;
        }

        var start = lastNotMatched + 1;
        var length = i - lastNotMatched;
        var substr = text.Substring(start, length);

        // A leading ellipsis attached to a preceding non-CJK letter/digit is rejected.
        if (substr.StartsWith(Constants.IPV6_ELLIPSIS) && start > 0 && IsNonCjkLetterOrDigit(text[start - 1]))
        {
            continue;
        }

        // A trailing ellipsis attached to a following non-CJK letter/digit is rejected.
        // The CJK test must look at the same following character (text[i + 1]) as the
        // letter/digit test; the previous code inspected text[start + 1] instead.
        if (substr.EndsWith(Constants.IPV6_ELLIPSIS) && i + 1 < text.Length && IsNonCjkLetterOrDigit(text[i + 1]))
        {
            continue;
        }

        // The run is reported only when a single regex match spans it exactly.
        var srcMatch = matchSource.Keys.FirstOrDefault(o => o.Index == start && o.Length == length);
        if (srcMatch == null)
        {
            continue;
        }

        result.Add(new ExtractResult
        {
            Start = start,
            Length = length,
            Text = substr,
            Type = ExtractType,
            Data = matchSource[srcMatch],
        });
    }

    return result;
}
/// <summary>
/// Extracts every maximal run of <paramref name="source"/> covered by the configured regexes that
/// coincides exactly with at least one regex match, tagging it with the name of the matching
/// type that has the smallest (Priority, Name) pair.
/// </summary>
/// <param name="source">Input text to scan; null or empty yields an empty list.</param>
/// <returns>The collected results after being passed through <c>FilterAmbiguity</c>.</returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var result = new List<ExtractResult>();
    var matchSource = new Dictionary<Match, TypeTag>();
    var matched = new bool[source.Length];

    // Options is tested as a bit mask (as the other option checks in this codebase do), so the
    // experimental filter stays active when ExperimentalMode is combined with other options.
    var experimentalMode = (Options & NumberOptions.ExperimentalMode) != 0;

    var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            // In ExperimentalMode, ambiguous fractions like "30000 in 2009" need to be skipped
            if (experimentalMode && AmbiguousFractionConnectorsRegex.Match(m.Value).Success)
            {
                continue;
            }

            for (var j = 0; j < m.Length; j++)
            {
                matched[m.Index + j] = true;
            }

            // Keep Source Data for extra information
            matchSource.Add(m, collection.Value);
        }
    }

    var last = -1;
    for (var i = 0; i < source.Length; i++)
    {
        if (!matched[i])
        {
            last = i;
            continue;
        }

        // Only act on the last character of a covered run.
        if (i + 1 < source.Length && matched[i + 1])
        {
            continue;
        }

        var runStart = last + 1;
        var runLength = i - last;

        // The run is reported only when some regex match spans it exactly.
        if (!matchSource.Keys.Any(o => o.Index == runStart && o.Length == runLength))
        {
            continue;
        }

        var type = matchSource
            .Where(p => p.Key.Index == runStart && p.Key.Length == runLength)
            .Select(p => (p.Value.Priority, p.Value.Name))
            .Min()
            .Item2;

        var start = runStart;
        var length = runLength;
        var substr = source.Substring(runStart, runLength);

        // Extract negative numbers: when the pattern matches in the text before the run,
        // the result is widened to begin at that match.
        // NOTE(review): nothing here checks that the match ends right at the run start;
        // presumably the pattern is end-anchored - confirm.
        if (NegativeNumberTermsRegex != null)
        {
            var match = NegativeNumberTermsRegex.Match(source.Substring(0, start));
            if (match.Success)
            {
                start = match.Index;
                length += match.Length;
                substr = match.Value + substr;
            }
        }

        result.Add(new ExtractResult
        {
            Start = start,
            Length = length,
            Text = substr,
            Type = ExtractType,
            Data = type,
        });
    }

    result = FilterAmbiguity(result, source);

    return result;
}
/// <summary>
/// Extracts every maximal run of <paramref name="source"/> covered by the configured regexes that
/// coincides exactly with a retained regex match, tagging it with the type name that has the
/// smallest (Priority, Name) pair and, for ordinal extractors, attaching metadata.
/// </summary>
/// <param name="source">Input text to scan; null or empty yields an empty list.</param>
/// <returns>The collected results after being passed through <c>FilterAmbiguity</c>.</returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var extracted = new List<ExtractResult>();
    var tagByMatch = new Dictionary<Match, TypeTag>();
    var covered = new bool[source.Length];

    var matchSets = Regexes.ToDictionary(entry => entry.Key.Matches(source), entry => entry.Value);

    foreach (var matchSet in matchSets)
    {
        foreach (Match candidate in matchSet.Key)
        {
            // ExperimentalMode drops ambiguous fractions such as "30000 in 2009".
            var experimental = (Options & NumberOptions.ExperimentalMode) != 0;
            if (experimental && AmbiguousFractionConnectorsRegex.Match(candidate.Value).Success)
            {
                continue;
            }

            // Without EnablePreview, relative ordinals such as "last" or "next" are dropped.
            var previewDisabled = (Options & NumberOptions.EnablePreview) == 0;
            if (previewDisabled && IsRelativeOrdinal(candidate.Value))
            {
                continue;
            }

            var end = candidate.Index + candidate.Length;
            for (var pos = candidate.Index; pos < end; pos++)
            {
                covered[pos] = true;
            }

            // Filter out cases like "first two", "last one" (only supported in English now).
            // The positions stay marked as covered, but the match is not remembered.
            var filteredOrdinal = ExtractType.Contains(Constants.MODEL_ORDINAL)
                && RelativeOrdinalFilterRegex != null
                && RelativeOrdinalFilterRegex.IsMatch(source);
            if (filteredOrdinal)
            {
                continue;
            }

            // Remember which tag produced the match.
            tagByMatch.Add(candidate, matchSet.Value);
        }
    }

    var lastGap = -1;
    for (var pos = 0; pos < source.Length; pos++)
    {
        if (!covered[pos])
        {
            lastGap = pos;
            continue;
        }

        // Only the final character of a covered run triggers extraction.
        var runContinues = pos + 1 != source.Length && covered[pos + 1];
        if (runContinues)
        {
            continue;
        }

        var runStart = lastGap + 1;
        var runLength = pos - lastGap;

        var hasExactMatch = tagByMatch.Keys.Any(key => key.Index == runStart && key.Length == runLength);
        if (!hasExactMatch)
        {
            continue;
        }

        var typeName = tagByMatch
            .Where(pair => pair.Key.Index == runStart && pair.Key.Length == runLength)
            .Select(pair => (pair.Value.Priority, pair.Value.Name))
            .Min()
            .Item2;

        var resultStart = runStart;
        var resultLength = runLength;
        var resultText = source.Substring(runStart, runLength);

        // Negative numbers: widen the result to a negative term found in the preceding text.
        if (NegativeNumberTermsRegex != null)
        {
            var negative = NegativeNumberTermsRegex.Match(source.Substring(0, resultStart));
            if (negative.Success)
            {
                resultStart = negative.Index;
                resultLength = resultLength + negative.Length;
                resultText = negative.Value + resultText;
            }
        }

        var er = new ExtractResult
        {
            Start = resultStart,
            Length = resultLength,
            Text = resultText,
            Type = ExtractType,
            Data = typeName,
        };

        // Ordinal extractors always get a Metadata object; the relative flag is set on demand.
        if (ExtractType.Contains(Constants.MODEL_ORDINAL))
        {
            er.Metadata = new Metadata();

            if (IsRelativeOrdinal(resultText))
            {
                er.Metadata.IsOrdinalRelative = true;
            }
        }

        extracted.Add(er);
    }

    extracted = FilterAmbiguity(extracted, source);

    return extracted;
}
/// <summary>
/// Extracts number-range spans from <paramref name="source"/>: every maximal run covered by the
/// spans reported by <c>GetMatchedStartAndLength</c> that coincides exactly with one such span.
/// </summary>
/// <param name="source">Input text to scan; null or empty yields an empty list.</param>
/// <returns>
/// One <see cref="ExtractResult"/> per accepted run, in increasing start order, with
/// <c>Data</c> set to the tag of the pattern that produced the span.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var results = new List<ExtractResult>();
    var matchSource = new Dictionary<Tuple<int, int>, string>();
    var matched = new bool[source.Length];

    var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            GetMatchedStartAndLength(m, collection.Value, source, out int start, out int length);

            if (start < 0 || length <= 0)
            {
                continue;
            }

            for (var j = 0; j < length; j++)
            {
                matched[start + j] = true;
            }

            // Keep Source Data for extra information.
            // Tuple keys compare by value, so two patterns resolving to the same span would make
            // Add throw ArgumentException; the first registered tag is kept instead.
            var span = new Tuple<int, int>(start, length);
            if (!matchSource.ContainsKey(span))
            {
                matchSource.Add(span, collection.Value);
            }
        }
    }

    var last = -1;
    for (var i = 0; i < source.Length; i++)
    {
        if (!matched[i])
        {
            last = i;
            continue;
        }

        // Only act on the last character of a covered run.
        if (i + 1 < source.Length && matched[i + 1])
        {
            continue;
        }

        var start = last + 1;
        var length = i - last;

        // The run is reported only when a registered span equals it exactly.
        if (!matchSource.TryGetValue(new Tuple<int, int>(start, length), out var tag))
        {
            continue;
        }

        results.Add(new ExtractResult
        {
            Start = start,
            Length = length,
            Text = source.Substring(start, length),
            Type = ExtractType,
            Data = tag,
        });
    }

    // In ExperimentalMode, cases like "from 3 to 5" and "between 10 and 15" are set to closed at both start and end
    if ((Options & NumberOptions.ExperimentalMode) != 0)
    {
        foreach (var result in results)
        {
            var data = result.Data.ToString();
            if (data == NumberRangeConstants.TWONUMBETWEEN || data == NumberRangeConstants.TWONUMTILL)
            {
                result.Data = NumberRangeConstants.TWONUMCLOSED;
            }
        }
    }

    return results;
}
/// <summary>
/// Extracts every maximal run of <paramref name="source"/> covered by the configured regexes
/// that coincides exactly with one regex match.
/// </summary>
/// <param name="source">Input text to scan; null or empty yields an empty list.</param>
/// <returns>
/// One <see cref="ExtractResult"/> per accepted run, in increasing start order, with
/// <c>Data</c> set to the tag registered for the matching regex.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var extracted = new List<ExtractResult>();
    var tagByMatch = new Dictionary<Match, string>();
    var covered = new bool[source.Length];

    var matchSets = Regexes.ToDictionary(entry => entry.Key.Matches(source), entry => entry.Value);

    foreach (var matchSet in matchSets)
    {
        foreach (Match candidate in matchSet.Key)
        {
            // Flag every character position spanned by this match.
            var end = candidate.Index + candidate.Length;
            for (var pos = candidate.Index; pos < end; pos++)
            {
                covered[pos] = true;
            }

            // Remember which tag produced the match.
            tagByMatch.Add(candidate, matchSet.Value);
        }
    }

    var lastGap = -1;
    for (var pos = 0; pos < source.Length; pos++)
    {
        if (!covered[pos])
        {
            lastGap = pos;
            continue;
        }

        // Only the final character of a covered run triggers extraction.
        var runContinues = pos + 1 != source.Length && covered[pos + 1];
        if (runContinues)
        {
            continue;
        }

        var runStart = lastGap + 1;
        var runLength = pos - lastGap;

        // A run is reported only when a single match spans it exactly.
        var exact = tagByMatch.Keys.FirstOrDefault(key => key.Index == runStart && key.Length == runLength);
        if (exact == null)
        {
            continue;
        }

        tagByMatch.TryGetValue(exact, out var tag);

        extracted.Add(new ExtractResult
        {
            Start = runStart,
            Length = runLength,
            Text = source.Substring(runStart, runLength),
            Type = ExtractType,
            Data = tag,
        });
    }

    return extracted;
}
/// <summary>
/// Extracts number-range spans from <paramref name="source"/>. Spans reported by
/// <c>GetMatchedStartAndLength</c> are collected, two-number ranges whose bounds contradict the
/// order of their "more"/"less" parts are discarded, and every remaining covered run that
/// coincides exactly with one collected span is returned.
/// </summary>
/// <param name="source">Input text to scan; null or empty yields an empty list.</param>
/// <returns>
/// One <see cref="ExtractResult"/> per accepted run, in increasing start order, with
/// <c>Data</c> set to the tag of the pattern that produced the span.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var results = new List<ExtractResult>();
    var matchSource = new Dictionary<Tuple<int, int>, string>();
    var matched = new bool[source.Length];

    var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            GetMatchedStartAndLength(m, collection.Value, source, out int start, out int length);

            if (start >= 0 && length > 0)
            {
                // Keep Source Data for extra information.
                // Tuple keys compare by value, so two patterns resolving to the same span would
                // make Add throw ArgumentException; the first registered tag is kept instead.
                var span = new Tuple<int, int>(start, length);
                if (!matchSource.ContainsKey(span))
                {
                    matchSource.Add(span, collection.Value);
                }
            }
        }
    }

    // matchSource ordered by decreasing span length so that "no less than x" is before "less than x".
    // Built lazily and only once: matchSource is not modified while it is being enumerated below.
    List<KeyValuePair<Tuple<int, int>, string>> matchList = null;

    foreach (var match in matchSource)
    {
        var start = match.Key.Item1;
        var length = match.Key.Item2;

        // Filter wrong two number ranges such as "more than 20 and less than 10" and "大于20小于10".
        if (match.Value.Equals(NumberRangeConstants.TWONUM, StringComparison.Ordinal))
        {
            var text = source.Substring(start, length);

            // Exactly two numbers (or, failing that, two ordinals) are required.
            var er = numberExtractor.Extract(text);
            if (er.Count != 2)
            {
                er = ordinalExtractor.Extract(text);
                if (er.Count != 2)
                {
                    continue;
                }
            }

            // The fallback literal must be a double: if Value is a boxed object (presumably it
            // is - confirm), "?? 0" would box an int and the (double) unboxing cast would throw
            // InvalidCastException.
            var nums = er.Select(r => (double)(numberParser.Parse(r).Value ?? 0d)).ToList();

            if (matchList == null)
            {
                matchList = matchSource.ToList();
                matchList.Sort((pair1, pair2) => pair2.Key.Item2.CompareTo(pair1.Key.Item2));
            }

            // Start offsets of the longest "more" and "less" sub-ranges lying inside this span.
            // First throws if the span contains no such sub-range, as it did before.
            var moreIndex = matchList.First(r =>
                r.Value.Equals(NumberRangeConstants.MORE, StringComparison.Ordinal) &&
                r.Key.Item1 >= start &&
                r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

            var lessIndex = matchList.First(r =>
                r.Value.Equals(NumberRangeConstants.LESS, StringComparison.Ordinal) &&
                r.Key.Item1 >= start &&
                r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

            // Keep the range only when the smaller number belongs to the "more" part.
            var consistent = (nums[0] < nums[1] && moreIndex <= lessIndex) ||
                             (nums[0] > nums[1] && moreIndex >= lessIndex);
            if (!consistent)
            {
                continue;
            }
        }

        // The entity is longer than 1, so don't mark the last char to represent the end.
        // To avoid no connector cases like "大于20小于10" being marked as a whole entity.
        for (var j = 0; j < length - 1; j++)
        {
            matched[start + j] = true;
        }
    }

    var last = -1;
    for (var i = 0; i < source.Length; i++)
    {
        if (!matched[i])
        {
            last = i;
            continue;
        }

        // Only act on the last marked character of a run.
        if (i + 1 < source.Length && matched[i + 1])
        {
            continue;
        }

        var start = last + 1;

        // One more than the marked run, because the final character of each entity was left unmarked.
        var length = i - last + 1;
        var substr = source.Substring(start, length);

        // The run is reported only when a registered span equals it exactly.
        if (!matchSource.TryGetValue(new Tuple<int, int>(start, length), out var tag))
        {
            continue;
        }

        results.Add(new ExtractResult
        {
            Start = start,
            Length = length,
            Text = substr,
            Type = ExtractType,
            Data = tag,
        });
    }

    // In ExperimentalMode, cases like "from 3 to 5" and "between 10 and 15" are set to closed at both start and end
    if ((Config.Options & NumberOptions.ExperimentalMode) != 0)
    {
        foreach (var result in results)
        {
            var data = result.Data.ToString();
            if (data == NumberRangeConstants.TWONUMBETWEEN || data == NumberRangeConstants.TWONUMTILL)
            {
                result.Data = NumberRangeConstants.TWONUMCLOSED;
            }
        }
    }

    return results;
}
/// <summary>
/// Extracts every maximal run of <paramref name="text"/> covered by valid regex matches
/// that coincides exactly with one such match.
/// </summary>
/// <param name="text">Input text to scan; null or empty yields an empty list.</param>
/// <returns>
/// One <see cref="ExtractResult"/> per accepted run, in increasing start order, with
/// <c>Data</c> set to the tag registered for the matching regex.
/// </returns>
public virtual List<ExtractResult> Extract(string text)
{
    var extracted = new List<ExtractResult>();

    if (string.IsNullOrEmpty(text))
    {
        return extracted;
    }

    var tagByMatch = new Dictionary<Match, string>();
    var covered = new bool[text.Length];

    // Walk all matches of all regexes and record which text positions they cover.
    var matchSets = Regexes.ToDictionary(entry => entry.Key.Matches(text), entry => entry.Value);

    foreach (var matchSet in matchSets)
    {
        foreach (Match candidate in matchSet.Key)
        {
            if (!IsValidMatch(candidate))
            {
                continue;
            }

            var end = candidate.Index + candidate.Length;
            for (var pos = candidate.Index; pos < end; pos++)
            {
                covered[pos] = true;
            }

            // Remember which tag produced the match.
            tagByMatch.Add(candidate, matchSet.Value);
        }
    }

    // Turn each covered interval into a result when a single match spans it exactly.
    var lastGap = -1;
    for (var pos = 0; pos < text.Length; pos++)
    {
        if (!covered[pos])
        {
            lastGap = pos;
            continue;
        }

        // Only the final character of a covered run triggers extraction.
        var runContinues = pos + 1 != text.Length && covered[pos + 1];
        if (runContinues)
        {
            continue;
        }

        var runStart = lastGap + 1;
        var runLength = pos - lastGap;

        var exact = tagByMatch.Keys.FirstOrDefault(key => key.Index == runStart && key.Length == runLength);
        if (exact == null)
        {
            continue;
        }

        tagByMatch.TryGetValue(exact, out var tag);

        extracted.Add(new ExtractResult
        {
            Start = runStart,
            Length = runLength,
            Text = text.Substring(runStart, runLength),
            Type = ExtractType,
            Data = tag,
        });
    }

    return extracted;
}
/// <summary>
/// Extracts every maximal run of <paramref name="source"/> covered by the retained regex matches
/// that coincides exactly with at least one of them, tagging it with the type name of the match
/// that sorts first by Priority and then by Name and, for ordinal extractors, attaching metadata.
/// </summary>
/// <param name="source">Input text to scan; null or empty yields an empty list.</param>
/// <returns>The collected results after being passed through <c>FilterAmbiguity</c>.</returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var result = new List<ExtractResult>();
    var matchSource = new Dictionary<Match, TypeTag>();
    var matched = new bool[source.Length];

    var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            // In ExperimentalMode, AmbiguousFraction like "30000 in 2009" needs to be skipped
            if ((Options & NumberOptions.ExperimentalMode) != 0 && AmbiguousFractionConnectorsRegex.Match(m.Value).Success)
            {
                continue;
            }

            // If SuppressExtendedTypes is on, cases like "last", "next" should be skipped
            if ((Options & NumberOptions.SuppressExtendedTypes) != 0 && m.Groups[Constants.RelativeOrdinalGroupName].Success)
            {
                continue;
            }

            // Matches containing separators 'in', 'out of' are skipped when the numerator
            // is greater than the denominator (equal values are kept).
            if (m.Groups["ambiguousSeparator"].Success)
            {
                int num = ParseNumber(m.Groups["numerator"]);
                int den = ParseNumber(m.Groups["denominator"]);

                if (num > den)
                {
                    continue;
                }
            }

            for (var j = 0; j < m.Length; j++)
            {
                matched[m.Index + j] = true;
            }

            // Keep Source Data for extra information
            matchSource.Add(m, collection.Value);
        }
    }

    var last = -1;
    for (var i = 0; i < source.Length; i++)
    {
        if (!matched[i])
        {
            last = i;
            continue;
        }

        // Only act on the last character of a covered run.
        if (i + 1 < source.Length && matched[i + 1])
        {
            continue;
        }

        var runStart = last + 1;
        var runLength = i - last;

        // The run is reported only when some regex match spans it exactly.
        var candidates = matchSource
            .Where(p => p.Key.Index == runStart && p.Key.Length == runLength)
            .ToList();

        if (candidates.Count == 0)
        {
            continue;
        }

        // Pick the candidate with the lowest priority value, then the lowest name. Ordering by
        // keys (instead of taking Min over a tuple that contains the Match) avoids comparing
        // Match objects, which are not IComparable and make Min throw when priority and name tie.
        var best = candidates
            .OrderBy(p => p.Value.Priority)
            .ThenBy(p => p.Value.Name)
            .First();

        var type = best.Value.Name;
        var originalMatch = best.Key;

        var start = runStart;
        var length = runLength;
        var substr = source.Substring(runStart, runLength);

        // Extract negative numbers: when the pattern matches in the text before the run,
        // the result is widened to begin at that match.
        if (NegativeNumberTermsRegex != null)
        {
            var match = NegativeNumberTermsRegex.Match(source.Substring(0, start));
            if (match.Success)
            {
                start = match.Index;
                length += match.Length;
                substr = match.Value + substr;
            }
        }

        var er = new ExtractResult
        {
            Start = start,
            Length = length,
            Text = substr,
            Type = ExtractType,
            Data = type,
        };

        // Add Metadata information for Ordinal
        if (ExtractType.Contains(Constants.MODEL_ORDINAL))
        {
            er.Metadata = new Metadata();

            if ((Options & NumberOptions.SuppressExtendedTypes) == 0 &&
                originalMatch.Groups[Constants.RelativeOrdinalGroupName].Success)
            {
                er.Metadata.IsOrdinalRelative = true;
            }
        }

        result.Add(er);
    }

    result = FilterAmbiguity(result, source);

    return result;
}
/// <summary>
/// Collects all regex matches in <paramref name="source"/>, passes them through
/// <c>RankMatches</c> and <c>RemoveRedundantMatches</c>, and extracts every maximal run covered
/// by the surviving matches that coincides exactly with one of them.
/// </summary>
/// <param name="source">Input text to scan; null or empty yields an empty list.</param>
/// <returns>
/// One <see cref="ExtractResult"/> per accepted run, in increasing run order, with
/// <c>Data</c> set to the tag registered for the matching regex.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var extracted = new List<ExtractResult>();
    var tagByMatch = new Dictionary<Match, string>();
    var covered = new bool[source.Length];

    var matchSets = Regexes.ToDictionary(entry => entry.Key.Matches(source), entry => entry.Value);

    // Gather every match first; coverage is computed only after ranking and pruning.
    foreach (var matchSet in matchSets)
    {
        foreach (Match candidate in matchSet.Key)
        {
            tagByMatch.Add(candidate, matchSet.Value);
        }
    }

    tagByMatch = RankMatches(tagByMatch);
    tagByMatch = RemoveRedundantMatches(tagByMatch);

    foreach (var survivor in tagByMatch.Keys)
    {
        var end = survivor.Index + survivor.Length;
        for (var pos = survivor.Index; pos < end; pos++)
        {
            covered[pos] = true;
        }
    }

    var lastGap = -1;
    for (var pos = 0; pos < source.Length; pos++)
    {
        if (!covered[pos])
        {
            lastGap = pos;
            continue;
        }

        // Only the final character of a covered run triggers extraction.
        var runContinues = pos + 1 != source.Length && covered[pos + 1];
        if (runContinues)
        {
            continue;
        }

        var runStart = lastGap + 1;
        var runLength = pos - lastGap;

        // A run is reported only when a single surviving match spans it exactly.
        var exact = tagByMatch.Keys.FirstOrDefault(key => key.Index == runStart && key.Length == runLength);
        if (exact == null)
        {
            continue;
        }

        var resultStart = runStart;
        var resultLength = runLength;
        var resultText = source.Substring(runStart, runLength);

        // Negative numbers: widen the result to a negative term found in the preceding text.
        if (NegativeNumberTermsRegex != null)
        {
            var negative = NegativeNumberTermsRegex.Match(source.Substring(0, resultStart));
            if (negative.Success)
            {
                resultStart = negative.Index;
                resultLength = resultLength + negative.Length;
                resultText = negative.Value + resultText;
            }
        }

        tagByMatch.TryGetValue(exact, out var tag);

        extracted.Add(new ExtractResult
        {
            Start = resultStart,
            Length = resultLength,
            Text = resultText,
            Type = ExtractType,
            Data = tag,
        });
    }

    return extracted;
}