/// <summary>
/// Computes the span (start index and length) to use for a regex match of the given range type.
/// </summary>
/// <param name="match">Regex match; its "number1" and "number2" groups hold the candidate number texts.</param>
/// <param name="type">Range type label associated with the regex that produced <paramref name="match"/>.</param>
/// <param name="source">The full text being extracted; forwarded to the validation helper.</param>
/// <param name="start">Receives the span start, or <c>NumberRangeConstants.INVALID_NUM</c> when the match is rejected.</param>
/// <param name="length">Receives the span length, or <c>NumberRangeConstants.INVALID_NUM</c> when the match is rejected.</param>
private void GetMatchedStartAndLength(Match match, string type, string source, out int start, out int length)
{
    // Pessimistic defaults: every early return below leaves the match rejected.
    start = NumberRangeConstants.INVALID_NUM;
    length = NumberRangeConstants.INVALID_NUM;

    var firstText = match.Groups["number1"].Value;
    var secondText = match.Groups["number2"].Value;

    if (!type.Contains(NumberRangeConstants.TWONUM))
    {
        // Single-number range: use whichever of the two groups actually captured text.
        var singleText = string.IsNullOrEmpty(firstText) ? secondText : firstText;
        var singleNumbers = ExtractNumberAndOrdinalFromStr(singleText);
        if (singleNumbers == null)
        {
            return;
        }

        // Begin from the raw match span; the validator receives both by ref and may adjust them.
        start = match.Index;
        length = match.Length;
        if (ValidateMatchAndGetStartAndLength(singleNumbers, singleText, match, source, ref start, ref length))
        {
            return;
        }

        start = NumberRangeConstants.INVALID_NUM;
        length = NumberRangeConstants.INVALID_NUM;
        return;
    }

    // Two-number range: both groups must yield extracted numbers.
    var firstNumbers = ExtractNumberAndOrdinalFromStr(firstText);
    var secondNumbers = ExtractNumberAndOrdinalFromStr(secondText);
    if (firstNumbers == null || secondNumbers == null)
    {
        return;
    }

    if (type.Contains(NumberRangeConstants.TWONUMTILL))
    {
        // For a "till" range the first number must not exceed the second.
        // NOTE(review): if Value is declared as object, "?? 0" boxes an int and the (double) unbox would fail - confirm the declared type.
        var lowerBound = (double)(numberParser.Parse(firstNumbers[0]).Value ?? 0);
        var upperBound = (double)(numberParser.Parse(secondNumbers[0]).Value ?? 0);
        if (lowerBound > upperBound)
        {
            return;
        }
    }

    start = match.Index;
    length = match.Length;

    // Both validations always run (no short-circuit) because each may update start/length through ref.
    var firstIsValid = ValidateMatchAndGetStartAndLength(firstNumbers, firstText, match, source, ref start, ref length);
    var secondIsValid = ValidateMatchAndGetStartAndLength(secondNumbers, secondText, match, source, ref start, ref length);
    if (firstIsValid && secondIsValid)
    {
        return;
    }

    start = NumberRangeConstants.INVALID_NUM;
    length = NumberRangeConstants.INVALID_NUM;
}
/// <summary>
/// Extracts number-range entities from <paramref name="source"/>.
/// Every regex in <c>Regexes</c> is run against the text, each match is validated through
/// <c>GetMatchedStartAndLength</c>, the accepted spans are marked on a per-character map, and one
/// <see cref="ExtractResult"/> is emitted for each contiguous marked run that coincides exactly
/// with an accepted span.
/// </summary>
/// <param name="source">Text to scan. A null or empty string yields an empty list.</param>
/// <returns>
/// The extracted ranges in order of appearance; <c>Data</c> carries the range type label of the
/// regex that produced the span.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    var results = new List<ExtractResult>();
    if (string.IsNullOrEmpty(source))
    {
        return results;
    }

    // Accepted spans keyed by (start, length); the value is the range type label of the producing regex.
    var matchSource = new Dictionary<Tuple<int, int>, string>();
    var matched = new bool[source.Length];

    var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            GetMatchedStartAndLength(m, collection.Value, source, out int start, out int length);
            if (start >= 0 && length > 0)
            {
                // Keep Source Data for extra information.
                // NOTE(review): Add throws ArgumentException if two regexes yield the same span - confirm the regexes cannot collide.
                matchSource.Add(new Tuple<int, int>(start, length), collection.Value);
            }
        }
    }

    // Accepted spans ordered by decreasing length, built at most once because matchSource is not modified below.
    List<KeyValuePair<Tuple<int, int>, string>> spansByLengthDesc = null;

    foreach (var match in matchSource)
    {
        var start = match.Key.Item1;
        var length = match.Key.Item2;

        // Filter wrong two number ranges such as "more than 20 and less than 10" and "大于20小于10".
        if (match.Value.Equals(NumberRangeConstants.TWONUM, StringComparison.Ordinal))
        {
            var text = source.Substring(start, length);

            // Exactly two numbers (or, failing that, exactly two ordinals) are required to check the range.
            var er = numberExtractor.Extract(text);
            if (er.Count != 2)
            {
                er = ordinalExtractor.Extract(text);
                if (er.Count != 2)
                {
                    continue;
                }
            }

            // NOTE(review): if Value is declared as object, "?? 0" boxes an int and the (double) unbox would fail - confirm the declared type.
            var nums = er.Select(r => (double)(numberParser.Parse(r).Value ?? 0)).ToList();

            if (spansByLengthDesc == null)
            {
                // Order matchSource by decreasing match length so that "no less than x" is before "less than x".
                spansByLengthDesc = matchSource.ToList();
                spansByLengthDesc.Sort((pair1, pair2) => pair2.Key.Item2.CompareTo(pair1.Key.Item2));
            }

            var moreIndex = FindContainedSpanStart(spansByLengthDesc, NumberRangeConstants.MORE, start, length);
            var lessIndex = FindContainedSpanStart(spansByLengthDesc, NumberRangeConstants.LESS, start, length);
            if (moreIndex < 0 || lessIndex < 0)
            {
                // Without both a "more" and a "less" part inside the span the range cannot be checked;
                // drop the candidate, like the "not exactly two numbers" case above, instead of throwing.
                continue;
            }

            // Keep the range only when the order of the two numbers agrees with the order of the
            // "more"/"less" parts. Equal numbers satisfy neither branch and are filtered out.
            var ascendingAndMoreFirst = nums[0] < nums[1] && moreIndex <= lessIndex;
            var descendingAndLessFirst = nums[0] > nums[1] && moreIndex >= lessIndex;
            if (!ascendingAndMoreFirst && !descendingAndLessFirst)
            {
                continue;
            }
        }

        // The entity is longer than 1, so don't mark the last char to represent the end.
        // To avoid no connector cases like "大于20小于10" being marked as a whole entity.
        for (var j = 0; j < length - 1; j++)
        {
            matched[start + j] = true;
        }
    }

    // Index of the most recent unmarked character; a marked run starts right after it.
    var last = -1;
    for (var i = 0; i < source.Length; i++)
    {
        if (!matched[i])
        {
            last = i;
            continue;
        }

        var runEndsHere = i + 1 == source.Length || !matched[i + 1];
        if (!runEndsHere)
        {
            continue;
        }

        // The run covers last + 1 .. i; the entity's final character was left unmarked, hence the extra + 1.
        var start = last + 1;
        var length = i - last + 1;

        // Tuple<int, int> compares structurally, so a fresh key finds the stored span directly.
        if (matchSource.TryGetValue(Tuple.Create(start, length), out var rangeType))
        {
            results.Add(new ExtractResult
            {
                Start = start,
                Length = length,
                Text = source.Substring(start, length),
                Type = ExtractType,
                Data = rangeType,
            });
        }
    }

    // In ExperimentalMode, cases like "from 3 to 5" and "between 10 and 15" are set to closed at both start and end
    if ((Config.Options & NumberOptions.ExperimentalMode) != 0)
    {
        foreach (var result in results)
        {
            var data = result.Data.ToString();
            if (data == NumberRangeConstants.TWONUMBETWEEN || data == NumberRangeConstants.TWONUMTILL)
            {
                result.Data = NumberRangeConstants.TWONUMCLOSED;
            }
        }
    }

    return results;

    // Returns the start index of the first span in orderedSpans that has the wanted type and lies
    // entirely inside [outerStart, outerStart + outerLength), or -1 when there is none.
    int FindContainedSpanStart(List<KeyValuePair<Tuple<int, int>, string>> orderedSpans, string wantedType, int outerStart, int outerLength)
    {
        foreach (var candidate in orderedSpans)
        {
            var candidateStart = candidate.Key.Item1;
            var candidateEnd = candidateStart + candidate.Key.Item2;
            if (candidate.Value.Equals(wantedType, StringComparison.Ordinal) &&
                candidateStart >= outerStart &&
                candidateEnd <= outerStart + outerLength)
            {
                return candidateStart;
            }
        }

        return -1;
    }
}