Example #1
0
        // TODO: this should not be in the NumberRangeExtractor as it doesn't handle duration concepts

        /// <summary>
        /// Runs both the number extractor and the ordinal extractor over <paramref name="numberStr"/> and
        /// returns their results as a single list.
        /// </summary>
        /// <param name="numberStr">Text to run both extractors on.</param>
        /// <param name="isAmbiguousRangeOrFraction">
        /// When true, the combined results are passed through RemoveAmbiguousFractions even if
        /// ShouldRemoveFractionWithInConnector returns false for the text.
        /// </param>
        /// <returns>
        /// Null when neither extractor finds anything; the single non-empty list when only one extractor
        /// matches; otherwise a new list of all results ordered by Length descending, then Start descending.
        /// </returns>
        private List<ExtractResult> ExtractNumberAndOrdinalFromStr(string numberStr, bool isAmbiguousRangeOrFraction = false)
        {
            var numbers = numberExtractor.Extract(numberStr);
            var ordinals = ordinalExtractor.Extract(numberStr);

            var hasNumbers = numbers.Count != 0;
            var hasOrdinals = ordinals.Count != 0;

            List<ExtractResult> combined;

            if (hasNumbers && hasOrdinals)
            {
                // Numbers first, then ordinals; the ordering is stable, so ties keep that relative order.
                combined = numbers
                    .Concat(ordinals)
                    .OrderByDescending(er => er.Length)
                    .ThenByDescending(er => er.Start)
                    .ToList();
            }
            else if (hasNumbers)
            {
                combined = numbers;
            }
            else if (hasOrdinals)
            {
                combined = ordinals;
            }
            else
            {
                combined = null;
            }

            // The connector check is evaluated unconditionally (it is the left operand of ||).
            var dropAmbiguousFractions = ShouldRemoveFractionWithInConnector(numberStr) || isAmbiguousRangeOrFraction;

            if (combined == null || !dropAmbiguousFractions)
            {
                return combined;
            }

            return RemoveAmbiguousFractions(combined);
        }
Example #2
0
        /// <summary>
        /// Runs both the number extractor and the ordinal extractor over <paramref name="numberStr"/> and
        /// returns their results as a single list.
        /// </summary>
        /// <param name="numberStr">Text to run both extractors on.</param>
        /// <returns>
        /// Null when neither extractor finds anything; the single non-empty list when only one extractor
        /// matches; otherwise all results ordered by Length descending, then Start descending.
        /// </returns>
        private List<ExtractResult> ExtractNumberAndOrdinalFromStr(string numberStr)
        {
            var numbers = numberExtractor.Extract(numberStr);
            var ordinals = ordinalExtractor.Extract(numberStr);

            var hasNumbers = numbers.Count != 0;
            var hasOrdinals = ordinals.Count != 0;

            if (!hasNumbers)
            {
                if (hasOrdinals)
                {
                    return ordinals;
                }

                return null;
            }

            if (!hasOrdinals)
            {
                return numbers;
            }

            // The ordinal results are appended to the number list in place before sorting into a new list.
            numbers.AddRange(ordinals);

            return numbers
                .OrderByDescending(er => er.Length)
                .ThenByDescending(er => er.Start)
                .ToList();
        }
Example #3
0
        /// <summary>
        /// Runs the number extractor over <paramref name="str"/> and replaces every extracted span with a
        /// placeholder (NumberPlaceHolder, or FractionPlaceHolder for results whose Data starts with "Frac"
        /// while percentage mode is enabled), so that the regexes can work on the rewritten text.
        /// </summary>
        /// <param name="str">sentence to process.</param>
        /// <param name="positionMap">
        /// receives a map from each index of the returned string to an index of <paramref name="str"/>:
        /// copied characters map to their own position, every placeholder character maps to the start of
        /// the span it replaced, and a final entry maps the returned string's length to str.Length.
        /// </param>
        /// <param name="numExtResults">number extractor result.</param>
        /// <returns>the input text with the extracted spans replaced by placeholders.</returns>
        private string PreprocessStrWithNumberExtracted(
            string str,
            out Dictionary<int, int> positionMap,
            out IList<ExtractResult> numExtResults)
        {
            positionMap = new Dictionary<int, int>();
            numExtResults = numberExtractor.Extract(str);

            // Nothing to rewrite. Without this guard an empty input would index match[0] on a zero-length array.
            if (string.IsNullOrEmpty(str))
            {
                positionMap.Add(0, 0);
                return string.Empty;
            }

            bool percentModeEnabled = (Options & NumberOptions.PercentageMode) != 0;

            // match[k] == 0: character k is not part of any extraction.
            // match[k] == i + 1: character k belongs to extraction i (a plain number).
            // match[k] == -(i + 1): character k belongs to extraction i (a fraction, percentage mode only).
            // A new int[] is already zero-filled, so no explicit initialisation is needed.
            var match = new int[str.Length];

            for (int i = 0; i < numExtResults.Count; i++)
            {
                var extraction = numExtResults[i];

                // The marker depends only on the extraction, so compute it once rather than per character.
                int marker = i + 1;
                if (percentModeEnabled &&
                    extraction.Data != null &&
                    extraction.Data.ToString().StartsWith("Frac", StringComparison.Ordinal))
                {
                    marker = -marker;
                }

                int spanStart = (int)extraction.Start;
                int spanEnd = (int)extraction.Length + spanStart;
                for (var j = spanStart; j < spanEnd; j++)
                {
                    // The first extraction to claim a character keeps it.
                    if (match[j] == 0)
                    {
                        match[j] = marker;
                    }
                }
            }

            var builder = new System.Text.StringBuilder(str.Length);
            int index = 0;
            int segmentStart = 0;

            // Walk the text as maximal runs of equal marker values: [segmentStart, segmentEnd).
            while (segmentStart < str.Length)
            {
                int type = match[segmentStart];
                int segmentEnd = segmentStart + 1;
                while (segmentEnd < str.Length && match[segmentEnd] == type)
                {
                    segmentEnd++;
                }

                if (type == 0)
                {
                    // subsequence which won't be extracted: copied verbatim, mapped one-to-one
                    builder.Append(str, segmentStart, segmentEnd - segmentStart);
                    for (int i = segmentStart; i < segmentEnd; i++)
                    {
                        positionMap.Add(index++, i);
                    }
                }
                else
                {
                    // subsequence which will be extracted as number, type is negative for fraction number extraction
                    var replaceText = type > 0 ? NumberPlaceHolder : FractionPlaceHolder;
                    builder.Append(replaceText);
                    for (int i = 0; i < replaceText.Length; i++)
                    {
                        positionMap.Add(index++, segmentStart);
                    }
                }

                segmentStart = segmentEnd;
            }

            // Sentinel entry: one past the end of the output maps to one past the end of the input.
            positionMap.Add(index, str.Length);

            return builder.ToString();
        }
        /// <summary>
        /// Runs the number extractor over <paramref name="str"/> and replaces every extracted span with
        /// "@" followed by NumExtType, so that the regexes can work on the rewritten text.
        /// </summary>
        /// <param name="str">Sentence to process.</param>
        /// <param name="positionMap">
        /// Receives a map from each index of the returned string to an index of <paramref name="str"/>:
        /// copied characters map to their own position, every character of the replacement text maps to
        /// the start of the span it replaced, and a final entry maps the returned string's length to str.Length.
        /// </param>
        /// <param name="numExtResults">Receives the results of the number extractor for <paramref name="str"/>.</param>
        /// <returns>The input text with the extracted spans replaced.</returns>
        private string PreprocessStrWithNumberExtracted(string str, out Dictionary<int, int> positionMap, out IList<ExtractResult> numExtResults)
        {
            positionMap = new Dictionary<int, int>();
            numExtResults = numberExtractor.Extract(str);

            // Nothing to rewrite. Without this guard an empty input would index match[0] on a zero-length array.
            if (string.IsNullOrEmpty(str))
            {
                positionMap.Add(0, 0);
                return string.Empty;
            }

            string replaceText = "@" + NumExtType;

            // match[k] == 0: character k is not part of any extraction.
            // match[k] == i + 1: character k belongs to extraction i.
            // A new int[] is already zero-filled, so no explicit initialisation is needed.
            var match = new int[str.Length];

            for (int i = 0; i < numExtResults.Count; i++)
            {
                var extraction = numExtResults[i];
                int spanStart = (int)extraction.Start;
                int spanEnd = (int)extraction.Length + spanStart;
                for (int j = spanStart; j < spanEnd; j++)
                {
                    // The first extraction to claim a character keeps it.
                    if (match[j] == 0)
                    {
                        match[j] = i + 1;
                    }
                }
            }

            var builder = new System.Text.StringBuilder(str.Length);
            int index = 0;
            int segmentStart = 0;

            // Walk the text as maximal runs of equal marker values: [segmentStart, segmentEnd).
            while (segmentStart < str.Length)
            {
                int type = match[segmentStart];
                int segmentEnd = segmentStart + 1;
                while (segmentEnd < str.Length && match[segmentEnd] == type)
                {
                    segmentEnd++;
                }

                if (type == 0)
                {
                    // Text outside any extraction: copied verbatim, mapped one-to-one.
                    builder.Append(str, segmentStart, segmentEnd - segmentStart);
                    for (int i = segmentStart; i < segmentEnd; i++)
                    {
                        positionMap.Add(index++, i);
                    }
                }
                else
                {
                    // Extracted number: every character of the replacement maps back to the span's start.
                    builder.Append(replaceText);
                    for (int i = 0; i < replaceText.Length; i++)
                    {
                        positionMap.Add(index++, segmentStart);
                    }
                }

                segmentStart = segmentEnd;
            }

            // Sentinel entry: one past the end of the output maps to one past the end of the input.
            positionMap.Add(index, str.Length);

            return builder.ToString();
        }
        /// <summary>
        /// Extracts entities from <paramref name="source"/> by running every regex in Regexes, marking the
        /// characters covered by accepted matches, and emitting one result for each marked run that
        /// coincides exactly with a recorded match.
        /// </summary>
        /// <param name="source">Text to scan.</param>
        /// <returns>
        /// The extracted results (Type is ExtractType, Data is the type string associated with the matching
        /// regex); an empty list when <paramref name="source"/> is null or empty.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var results = new List<ExtractResult>();
            var matchSource = new Dictionary<Tuple<int, int>, string>();
            var matched = new bool[source.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    GetMatchedStartAndLength(m, collection.Value, source, out int start, out int length);

                    if (start >= 0 && length > 0)
                    {
                        // Keep Source Data for extra information
                        // NOTE(review): Add throws if two regexes yield the same (start, length); which type
                        // should win in that case is not visible from here, so the behavior is left unchanged.
                        matchSource.Add(new Tuple<int, int>(start, length), collection.Value);
                    }
                }
            }

            // matchSource ordered by decreasing match length so that "no less than x" is before "less than x".
            // Built at most once, on the first two-number range encountered; matchSource is not modified below.
            List<KeyValuePair<Tuple<int, int>, string>> matchesByLength = null;

            foreach (var match in matchSource)
            {
                var start = match.Key.Item1;
                var length = match.Key.Item2;

                // Filter wrong two number ranges such as "more than 20 and less than 10" and "大于20小于10".
                if (match.Value.Equals(NumberRangeConstants.TWONUM, StringComparison.Ordinal))
                {
                    var text = source.Substring(start, length);

                    var er = numberExtractor.Extract(text);

                    if (er.Count != 2)
                    {
                        er = ordinalExtractor.Extract(text);

                        if (er.Count != 2)
                        {
                            continue;
                        }
                    }

                    // The fallback is 0d (a double) rather than 0: if Value is declared as object, a boxed int
                    // cannot be unboxed to double and the cast would throw InvalidCastException.
                    var nums = er.Select(r => (double)(numberParser.Parse(r).Value ?? 0d)).ToList();

                    if (matchesByLength == null)
                    {
                        matchesByLength = matchSource.ToList();
                        matchesByLength.Sort((pair1, pair2) => pair2.Key.Item2.CompareTo(pair1.Key.Item2));
                    }

                    // NOTE(review): First throws when no MORE / LESS match lies inside this span; presumably the
                    // regex set guarantees one of each for every TWONUM match - confirm.
                    var moreIndex = matchesByLength.First(r =>
                        r.Value.Equals(NumberRangeConstants.MORE, StringComparison.Ordinal) &&
                        r.Key.Item1 >= start && r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

                    var lessIndex = matchesByLength.First(r =>
                        r.Value.Equals(NumberRangeConstants.LESS, StringComparison.Ordinal) &&
                        r.Key.Item1 >= start && r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

                    // Keep the range only when the order of the two numbers agrees with the order of the
                    // "more" and "less" parts; equal numbers are rejected.
                    if (!((nums[0] < nums[1] && moreIndex <= lessIndex) || (nums[0] > nums[1] && moreIndex >= lessIndex)))
                    {
                        continue;
                    }
                }

                // The entity is longer than 1, so don't mark the last char to represent the end.
                // To avoid no connector cases like "大于20小于10" being marked as a whole entity.
                for (var j = 0; j < length - 1; j++)
                {
                    matched[start + j] = true;
                }
            }

            var last = -1;

            for (var i = 0; i < source.Length; i++)
            {
                if (matched[i])
                {
                    if (i + 1 == source.Length || !matched[i + 1])
                    {
                        // A marked run ends at i; the entity also includes the one unmarked character after it.
                        var start = last + 1;
                        var length = i - last + 1;

                        // Tuple<int, int> compares structurally, so a single dictionary lookup is equivalent
                        // to scanning the keys for the same (start, length) pair.
                        if (matchSource.TryGetValue(new Tuple<int, int>(start, length), out var data))
                        {
                            var er = new ExtractResult
                            {
                                Start = start,
                                Length = length,
                                Text = source.Substring(start, length),
                                Type = ExtractType,
                                Data = data,
                            };

                            results.Add(er);
                        }
                    }
                }
                else
                {
                    last = i;
                }
            }

            // In ExperimentalMode, cases like "from 3 to 5" and "between 10 and 15" are set to closed at both start and end
            if ((Config.Options & NumberOptions.ExperimentalMode) != 0)
            {
                foreach (var result in results)
                {
                    var data = result.Data.ToString();
                    if (data == NumberRangeConstants.TWONUMBETWEEN ||
                        data == NumberRangeConstants.TWONUMTILL)
                    {
                        result.Data = NumberRangeConstants.TWONUMCLOSED;
                    }
                }
            }

            return results;
        }