// TODO: this should not be in the NumberRangeExtractor as it doesn't handle duration concepts
/// <summary>
/// Runs both the number extractor and the ordinal extractor over <paramref name="numberStr"/> and combines their results.
/// </summary>
/// <param name="numberStr">Text to run both extractors on.</param>
/// <param name="isAmbiguousRangeOrFraction">
/// When true, the combined results are passed through RemoveAmbiguousFractions even if
/// ShouldRemoveFractionWithInConnector returns false for the text.
/// </param>
/// <returns>
/// Null when neither extractor finds anything; the results of the only extractor that found something;
/// otherwise a new list holding both result sets ordered by Length (descending), then by Start (descending).
/// </returns>
private List<ExtractResult> ExtractNumberAndOrdinalFromStr(string numberStr, bool isAmbiguousRangeOrFraction = false)
{
    var numbers = numberExtractor.Extract(numberStr);
    var ordinals = ordinalExtractor.Extract(numberStr);

    List<ExtractResult> combined;
    if (numbers.Count != 0 && ordinals.Count != 0)
    {
        // Both extractors produced something: merge into a fresh list, longest entries first.
        combined = numbers
            .Concat(ordinals)
            .OrderByDescending(item => item.Length)
            .ThenByDescending(item => item.Start)
            .ToList();
    }
    else if (numbers.Count != 0)
    {
        combined = numbers;
    }
    else if (ordinals.Count != 0)
    {
        combined = ordinals;
    }
    else
    {
        combined = null;
    }

    // The connector check is evaluated for every input, including those with no results.
    var dropForInConnector = ShouldRemoveFractionWithInConnector(numberStr);

    if (combined == null)
    {
        return null;
    }

    if (dropForInConnector || isAmbiguousRangeOrFraction)
    {
        combined = RemoveAmbiguousFractions(combined);
    }

    return combined;
}
/// <summary>
/// Runs both the number extractor and the ordinal extractor over <paramref name="numberStr"/> and combines their results.
/// </summary>
/// <param name="numberStr">Text to run both extractors on.</param>
/// <returns>
/// Null when neither extractor finds anything; the results of the only extractor that found something;
/// otherwise both result sets ordered by Length (descending), then by Start (descending).
/// </returns>
private List<ExtractResult> ExtractNumberAndOrdinalFromStr(string numberStr)
{
    var numbers = numberExtractor.Extract(numberStr);
    var ordinals = ordinalExtractor.Extract(numberStr);

    bool hasNumbers = numbers.Count != 0;
    bool hasOrdinals = ordinals.Count != 0;

    if (hasNumbers && hasOrdinals)
    {
        // The ordinal results are appended to the number list itself before sorting.
        numbers.AddRange(ordinals);
        return numbers
            .OrderByDescending(item => item.Length)
            .ThenByDescending(item => item.Start)
            .ToList();
    }

    if (hasNumbers)
    {
        return numbers;
    }

    return hasOrdinals ? ordinals : null;
}
/// <summary>
/// Get the number extractor results and replace each extracted number in the sentence with a placeholder,
/// so that the regexes can work on the normalized text.
/// </summary>
/// <param name="str">Sentence to process.</param>
/// <param name="positionMap">
/// Maps every index of the returned string to an index of <paramref name="str"/>. All characters of a placeholder
/// map to the start of the span it replaced; one extra entry maps the returned string's length to <c>str.Length</c>.
/// </param>
/// <param name="numExtResults">The raw results of running the number extractor over <paramref name="str"/>.</param>
/// <returns>
/// The sentence with each extracted span replaced by NumberPlaceHolder, or by FractionPlaceHolder when
/// percentage mode is enabled and the extraction's Data starts with "Frac". An empty string is returned for
/// null or empty input.
/// </returns>
private string PreprocessStrWithNumberExtracted(
    string str, out Dictionary<int, int> positionMap, out IList<ExtractResult> numExtResults)
{
    positionMap = new Dictionary<int, int>();
    numExtResults = numberExtractor.Extract(str);

    // Nothing to scan: without this guard the run scan below would index into a zero-length array.
    if (string.IsNullOrEmpty(str))
    {
        positionMap.Add(0, 0);
        return string.Empty;
    }

    bool percentModeEnabled = (Options & NumberOptions.PercentageMode) != 0;

    // Per-character marker: 0 = not extracted, i + 1 = covered by extraction i,
    // -(i + 1) = covered by fraction extraction i. A new int[] is already zero-filled.
    var match = new int[str.Length];

    for (int i = 0; i < numExtResults.Count; i++)
    {
        var extraction = numExtResults[i];
        int spanStart = (int)extraction.Start;
        int spanEnd = (int)extraction.Length + spanStart;

        bool isFraction = percentModeEnabled &&
                          (extraction.Data?.ToString() ?? string.Empty).StartsWith("Frac", StringComparison.Ordinal);
        int marker = isFraction ? -(i + 1) : i + 1;

        for (var j = spanStart; j < spanEnd; j++)
        {
            // The first extraction to claim a character keeps it.
            if (match[j] == 0)
            {
                match[j] = marker;
            }
        }
    }

    var builder = new System.Text.StringBuilder(str.Length);
    int index = 0;
    int runStart = 0;

    // Walk the maximal runs of identical markers; i is the exclusive end of the current run.
    for (int i = 1; i <= str.Length; i++)
    {
        if (i < str.Length && match[i] == match[i - 1])
        {
            continue;
        }

        int type = match[runStart];
        if (type == 0)
        {
            // Subsequence which won't be extracted: copy it through and map positions one-to-one.
            builder.Append(str, runStart, i - runStart);
            for (int k = runStart; k < i; k++)
            {
                positionMap.Add(index++, k);
            }
        }
        else
        {
            // Subsequence which will be extracted as number; type is negative for fraction number extraction.
            var replaceText = type > 0 ? NumberPlaceHolder : FractionPlaceHolder;
            builder.Append(replaceText);
            for (int k = 0; k < replaceText.Length; k++)
            {
                positionMap.Add(index++, runStart);
            }
        }

        runStart = i;
    }

    positionMap.Add(index, str.Length);

    return builder.ToString();
}
/// <summary>
/// Get the number extractor results and replace each extracted number in the sentence with
/// "@" followed by NumExtType, so that the regexes can work on the normalized text.
/// </summary>
/// <param name="str">Sentence to process.</param>
/// <param name="positionMap">
/// Maps every index of the returned string to an index of <paramref name="str"/>. All characters of a placeholder
/// map to the start of the span it replaced; one extra entry maps the returned string's length to <c>str.Length</c>.
/// </param>
/// <param name="numExtResults">The raw results of running the number extractor over <paramref name="str"/>.</param>
/// <returns>
/// The sentence with each extracted span replaced by the placeholder text. An empty string is returned for
/// null or empty input.
/// </returns>
private string PreprocessStrWithNumberExtracted(string str, out Dictionary<int, int> positionMap, out IList<ExtractResult> numExtResults)
{
    positionMap = new Dictionary<int, int>();
    numExtResults = numberExtractor.Extract(str);

    // Nothing to scan: without this guard the run scan below would index into a zero-length array.
    if (string.IsNullOrEmpty(str))
    {
        positionMap.Add(0, 0);
        return string.Empty;
    }

    string replaceText = "@" + NumExtType;

    // @TODO potential cause of GC
    // Per-character marker: -1 = not extracted, otherwise the index of the extraction covering the character.
    int[] match = new int[str.Length];
    for (int i = 0; i < match.Length; i++)
    {
        match[i] = -1;
    }

    for (int i = 0; i < numExtResults.Count; i++)
    {
        var extraction = numExtResults[i];
        int spanStart = (int)extraction.Start;
        int spanEnd = (int)extraction.Length + spanStart;

        for (int j = spanStart; j < spanEnd; j++)
        {
            // The first extraction to claim a character keeps it.
            if (match[j] == -1)
            {
                match[j] = i;
            }
        }
    }

    var builder = new System.Text.StringBuilder(str.Length);
    int index = 0;
    int runStart = 0;

    // Walk the maximal runs of identical markers; i is the exclusive end of the current run.
    for (int i = 1; i <= str.Length; i++)
    {
        if (i < str.Length && match[i] == match[i - 1])
        {
            continue;
        }

        if (match[runStart] == -1)
        {
            // Text outside any extraction: copy it through and map positions one-to-one.
            builder.Append(str, runStart, i - runStart);
            for (int k = runStart; k < i; k++)
            {
                positionMap.Add(index++, k);
            }
        }
        else
        {
            // Extracted number: emit the placeholder, every character of it pointing at the span start.
            builder.Append(replaceText);
            for (int k = 0; k < replaceText.Length; k++)
            {
                positionMap.Add(index++, runStart);
            }
        }

        runStart = i;
    }

    positionMap.Add(index, str.Length);

    return builder.ToString();
}
/// <summary>
/// Extracts number range entities from <paramref name="source"/> by running every regex in Regexes,
/// filtering inconsistent two-number ranges, and emitting one result per marked region whose
/// exact span was produced by a regex match.
/// </summary>
/// <param name="source">Text to extract from.</param>
/// <returns>
/// The extracted results (Type = ExtractType, Data = the type string associated with the matching regex);
/// an empty list when <paramref name="source"/> is null or empty.
/// </returns>
public virtual List<ExtractResult> Extract(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return new List<ExtractResult>();
    }

    var results = new List<ExtractResult>();
    var matchSource = new Dictionary<Tuple<int, int>, string>();
    var matched = new bool[source.Length];

    var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            GetMatchedStartAndLength(m, collection.Value, source, out int start, out int length);

            if (start >= 0 && length > 0)
            {
                // Keep Source Data for extra information.
                // NOTE(review): Add throws if two matches resolve to the same (start, length) span - confirm
                // the regexes cannot overlap exactly, or decide which type should win.
                matchSource.Add(new Tuple<int, int>(start, length), collection.Value);
            }
        }
    }

    // Matches ordered by decreasing match length so that "no less than x" is before "less than x".
    // matchSource is not modified below, so the list is built once, and only if a two-number range needs it.
    List<KeyValuePair<Tuple<int, int>, string>> matchList = null;

    foreach (var match in matchSource)
    {
        var start = match.Key.Item1;
        var length = match.Key.Item2;

        // Filter wrong two number ranges such as "more than 20 and less than 10" and "大于20小于10".
        if (match.Value.Equals(NumberRangeConstants.TWONUM, StringComparison.Ordinal))
        {
            var text = source.Substring(start, length);

            // Exactly two numbers are required; fall back to ordinals when the number extractor disagrees.
            var er = numberExtractor.Extract(text);
            if (er.Count != 2)
            {
                er = ordinalExtractor.Extract(text);
                if (er.Count != 2)
                {
                    continue;
                }
            }

            // The fallback is a double literal: a boxed int cannot be unboxed as double.
            var nums = er.Select(r => (double)(numberParser.Parse(r).Value ?? 0d)).ToList();

            if (matchList == null)
            {
                matchList = matchSource.ToList();
                matchList.Sort((pair1, pair2) => pair2.Key.Item2.CompareTo(pair1.Key.Item2));
            }

            // Start positions of the longest MORE and LESS matches fully contained in this range.
            // NOTE(review): First throws InvalidOperationException when no such sub-match exists.
            int moreIndex = matchList.First(r =>
                r.Value.Equals(NumberRangeConstants.MORE, StringComparison.Ordinal) &&
                r.Key.Item1 >= start &&
                r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

            int lessIndex = matchList.First(r =>
                r.Value.Equals(NumberRangeConstants.LESS, StringComparison.Ordinal) &&
                r.Key.Item1 >= start &&
                r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

            // Keep the range only when the order of the numbers agrees with the order of the more/less parts.
            if (!((nums[0] < nums[1] && moreIndex <= lessIndex) || (nums[0] > nums[1] && moreIndex >= lessIndex)))
            {
                continue;
            }
        }

        // The entity is longer than 1, so don't mark the last char to represent the end.
        // To avoid no connector cases like "大于20小于10" being marked as a whole entity.
        for (var j = 0; j < length - 1; j++)
        {
            matched[start + j] = true;
        }
    }

    var last = -1;
    for (var i = 0; i < source.Length; i++)
    {
        if (matched[i])
        {
            // End of a marked run: the entity spans from just after the last unmarked char
            // up to and including the (unmarked) char that follows the run.
            if (i + 1 == source.Length || !matched[i + 1])
            {
                var start = last + 1;
                var length = i - last + 1;

                // Tuple<int, int> compares structurally, so a direct lookup finds the regex match with this exact span.
                if (matchSource.TryGetValue(new Tuple<int, int>(start, length), out var data))
                {
                    var er = new ExtractResult
                    {
                        Start = start,
                        Length = length,
                        Text = source.Substring(start, length),
                        Type = ExtractType,
                        Data = data,
                    };

                    results.Add(er);
                }
            }
        }
        else
        {
            last = i;
        }
    }

    // In ExperimentalMode, cases like "from 3 to 5" and "between 10 and 15" are set to closed at both start and end
    if ((Config.Options & NumberOptions.ExperimentalMode) != 0)
    {
        foreach (var result in results)
        {
            var data = result.Data.ToString();
            if (data == NumberRangeConstants.TWONUMBETWEEN || data == NumberRangeConstants.TWONUMTILL)
            {
                result.Data = NumberRangeConstants.TWONUMCLOSED;
            }
        }
    }

    return results;
}