Esempio n. 1
0
        /// <summary>
        /// Runs every configured regex over <paramref name="text"/> and gathers the extraction results.
        /// </summary>
        /// <param name="text">The input text to scan.</param>
        /// <returns>
        /// An empty list for null or empty input; otherwise the value returned by <c>PostFilter</c>
        /// for the results accumulated through <c>GetResult</c>.
        /// </returns>
        public override List<ExtractResult> Extract(string text)
        {
            var extracted = new List<ExtractResult>();

            if (string.IsNullOrEmpty(text))
            {
                return extracted;
            }

            var sourceByMatch = new Dictionary<Match, string>();
            var covered = new bool[text.Length];

            // Pair the match collection of each regex with the value registered for that regex.
            var matchesByRegex = Regexes.ToDictionary(entry => entry.Key.Matches(text), entry => entry.Value);

            foreach (var entry in matchesByRegex)
            {
                // Each regex starts from a clean coverage map.
                for (var pos = 0; pos < covered.Length; pos++)
                {
                    covered[pos] = false;
                }

                foreach (Match candidate in entry.Key)
                {
                    if (!IsValidMatch(candidate))
                    {
                        continue;
                    }

                    // Flag every character position covered by this match.
                    var end = candidate.Index + candidate.Length;
                    for (var pos = candidate.Index; pos < end; pos++)
                    {
                        covered[pos] = true;
                    }

                    // Remember which source value produced this match.
                    sourceByMatch.Add(candidate, entry.Value);
                }

                // Results are produced per regex; note that sourceByMatch is NOT reset between regexes.
                GetResult(covered, text, sourceByMatch, extracted);
            }

            return PostFilter(extracted);
        }
Esempio n. 2
0
        /// <summary>
        /// Extracts every maximal run of regex-covered characters in <paramref name="text"/> that
        /// coincides exactly with one of the regex matches.
        /// </summary>
        /// <param name="text">The input text to scan.</param>
        /// <returns>
        /// One <c>ExtractResult</c> per accepted run, in order of appearance; an empty list for
        /// null or empty input.
        /// </returns>
        public override List<ExtractResult> Extract(string text)
        {
            var result = new List<ExtractResult>();

            if (string.IsNullOrEmpty(text))
            {
                return result;
            }

            var matchSource = new Dictionary<Match, string>();
            var matched = new bool[text.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(text), p => p.Value);

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    for (var j = 0; j < m.Length; j++)
                    {
                        matched[m.Index + j] = true;
                    }

                    // Keep Source Data for extra information
                    matchSource.Add(m, collection.Value);
                }
            }

            var lastNotMatched = -1;

            for (var i = 0; i < text.Length; i++)
            {
                if (!matched[i])
                {
                    lastNotMatched = i;
                    continue;
                }

                // Only the last character of a matched run triggers result construction.
                if (i + 1 < text.Length && matched[i + 1])
                {
                    continue;
                }

                var start = lastNotMatched + 1;
                var length = i - lastNotMatched;
                var substr = text.Substring(start, length);

                // Reject a run that begins with the IPV6_ELLIPSIS token when it is glued to a
                // preceding non-CJK letter or digit.
                if (substr.StartsWith(Constants.IPV6_ELLIPSIS) &&
                    start > 0 && char.IsLetterOrDigit(text[start - 1]) && !SimpleTokenizer.IsCjk(text[start - 1]))
                {
                    continue;
                }

                // Reject a run that ends with the IPV6_ELLIPSIS token when it is glued to a
                // following non-CJK letter or digit. The CJK test must look at the same character
                // as the letter/digit test (text[i + 1]); the previous code inspected
                // text[start + 1], a character inside the run itself.
                if (substr.EndsWith(Constants.IPV6_ELLIPSIS) &&
                    i + 1 < text.Length && char.IsLetterOrDigit(text[i + 1]) && !SimpleTokenizer.IsCjk(text[i + 1]))
                {
                    continue;
                }

                // A run is reported only when it is exactly one regex match (same start and length).
                var srcMatch = matchSource.Keys.FirstOrDefault(o => o.Index == start && o.Length == length);

                if (srcMatch != null)
                {
                    result.Add(new ExtractResult
                    {
                        Start = start,
                        Length = length,
                        Text = substr,
                        Type = ExtractType,
                        Data = matchSource[srcMatch],
                    });
                }
            }

            return result;
        }
        /// <summary>
        /// Extracts numeric entities from <paramref name="source"/> by merging all regex matches into
        /// runs of covered characters and keeping the runs that coincide exactly with a match.
        /// </summary>
        /// <param name="source">The input text to scan.</param>
        /// <returns>
        /// The extracted results after <c>FilterAmbiguity</c> has been applied; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var result = new List<ExtractResult>();
            var matchSource = new Dictionary<Match, TypeTag>();
            var matched = new bool[source.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);

            // Test the option as a flag so that ExperimentalMode is still honoured when it is
            // combined with other options (an equality test only fires when it is the sole option).
            var experimentalMode = (Options & NumberOptions.ExperimentalMode) != 0;

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    // In ExperimentalMode, AmbiguousFraction like "30000 in 2009" needs to be skipped
                    if (experimentalMode && AmbiguousFractionConnectorsRegex.Match(m.Value).Success)
                    {
                        continue;
                    }

                    for (var j = 0; j < m.Length; j++)
                    {
                        matched[m.Index + j] = true;
                    }

                    // Keep Source Data for extra information
                    matchSource.Add(m, collection.Value);
                }
            }

            var last = -1;

            for (var i = 0; i < source.Length; i++)
            {
                if (!matched[i])
                {
                    last = i;
                    continue;
                }

                // Only the last character of a matched run triggers result construction.
                if (i + 1 < source.Length && matched[i + 1])
                {
                    continue;
                }

                var start = last + 1;
                var length = i - last;
                var substr = source.Substring(start, length);

                // All matches that cover exactly this run, reduced to (priority, name) pairs.
                var candidates = matchSource
                    .Where(p => p.Key.Index == start && p.Key.Length == length)
                    .Select(p => (p.Value.Priority, p.Value.Name))
                    .ToList();

                if (candidates.Count == 0)
                {
                    continue;
                }

                // The smallest (priority, name) pair decides the reported type.
                var type = candidates.Min().Item2;

                // Extract negative numbers: a negative term found in the text before the run is
                // prepended to the result.
                if (NegativeNumberTermsRegex != null)
                {
                    var match = NegativeNumberTermsRegex.Match(source.Substring(0, start));
                    if (match.Success)
                    {
                        start = match.Index;
                        length = length + match.Length;
                        substr = match.Value + substr;
                    }
                }

                var er = new ExtractResult
                {
                    Start = start,
                    Length = length,
                    Text = substr,
                    Type = ExtractType,
                    Data = type
                };
                result.Add(er);
            }

            result = FilterAmbiguity(result, source);

            return result;
        }
Esempio n. 4
0
        /// <summary>
        /// Extracts numeric/ordinal entities from <paramref name="source"/>: regex matches are merged
        /// into runs of covered characters, and a run is reported when it coincides exactly with a
        /// recorded match.
        /// </summary>
        /// <param name="source">The input text to scan.</param>
        /// <returns>
        /// The extracted results after <c>FilterAmbiguity</c> has been applied; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var extracted = new List<ExtractResult>();
            var tagByMatch = new Dictionary<Match, TypeTag>();
            var covered = new bool[source.Length];

            var matchesByRegex = Regexes.ToDictionary(entry => entry.Key.Matches(source), entry => entry.Value);

            foreach (var entry in matchesByRegex)
            {
                foreach (Match hit in entry.Key)
                {
                    // In ExperimentalMode, ambiguous fractions like "30000 in 2009" are skipped.
                    var experimental = (Options & NumberOptions.ExperimentalMode) != 0;
                    if (experimental && AmbiguousFractionConnectorsRegex.Match(hit.Value).Success)
                    {
                        continue;
                    }

                    // Relative ordinals ("last", "next") are only kept when EnablePreview is set.
                    var previewDisabled = (Options & NumberOptions.EnablePreview) == 0;
                    if (previewDisabled && IsRelativeOrdinal(hit.Value))
                    {
                        continue;
                    }

                    var end = hit.Index + hit.Length;
                    for (var pos = hit.Index; pos < end; pos++)
                    {
                        covered[pos] = true;
                    }

                    // Filter out cases like "first two", "last one" (only supported in English now).
                    // The characters stay marked as covered; only the match record is withheld.
                    var filteredOrdinal = ExtractType.Contains(Constants.MODEL_ORDINAL)
                                          && RelativeOrdinalFilterRegex != null
                                          && RelativeOrdinalFilterRegex.IsMatch(source);
                    if (filteredOrdinal)
                    {
                        continue;
                    }

                    // Remember the tag of the regex that produced this match.
                    tagByMatch.Add(hit, entry.Value);
                }
            }

            var gap = -1;

            for (var pos = 0; pos < source.Length; pos++)
            {
                if (!covered[pos])
                {
                    gap = pos;
                    continue;
                }

                // Only the last character of a covered run triggers result construction.
                var runContinues = pos + 1 != source.Length && covered[pos + 1];
                if (runContinues)
                {
                    continue;
                }

                var start = gap + 1;
                var length = pos - gap;
                var piece = source.Substring(start, length);

                // (priority, name) of every recorded match that spans exactly this run.
                var candidates = tagByMatch
                    .Where(p => p.Key.Index == start && p.Key.Length == length)
                    .Select(p => (p.Value.Priority, p.Value.Name))
                    .ToList();

                if (candidates.Count == 0)
                {
                    continue;
                }

                var type = candidates.Min().Item2;

                // Extract negative numbers: prepend a negative term found in the preceding text.
                if (NegativeNumberTermsRegex != null)
                {
                    var negative = NegativeNumberTermsRegex.Match(source.Substring(0, start));
                    if (negative.Success)
                    {
                        start = negative.Index;
                        length += negative.Length;
                        piece = negative.Value + piece;
                    }
                }

                var entity = new ExtractResult
                {
                    Start = start,
                    Length = length,
                    Text = piece,
                    Type = ExtractType,
                    Data = type,
                };

                // Ordinal results always carry metadata; the relative flag is set only when applicable.
                if (ExtractType.Contains(Constants.MODEL_ORDINAL))
                {
                    entity.Metadata = new Metadata();
                    if (IsRelativeOrdinal(piece))
                    {
                        entity.Metadata.IsOrdinalRelative = true;
                    }
                }

                extracted.Add(entity);
            }

            extracted = FilterAmbiguity(extracted, source);

            return extracted;
        }
Esempio n. 5
0
        /// <summary>
        /// Extracts number-range entities from <paramref name="source"/>. Each regex match is first
        /// narrowed by <c>GetMatchedStartAndLength</c>; the resulting spans are merged into runs of
        /// covered characters, and a run is reported when it coincides exactly with a recorded span.
        /// </summary>
        /// <param name="source">The input text to scan.</param>
        /// <returns>
        /// One <c>ExtractResult</c> per accepted run, in order of appearance; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var results = new List<ExtractResult>();
            var matchSource = new Dictionary<Tuple<int, int>, string>();
            var matched = new bool[source.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    GetMatchedStartAndLength(m, collection.Value, source, out int start, out int length);

                    if (start < 0 || length <= 0)
                    {
                        continue;
                    }

                    for (var j = 0; j < length; j++)
                    {
                        matched[start + j] = true;
                    }

                    // Keep Source Data for extra information.
                    // Tuple keys compare by value, so two patterns resolving to the same span would
                    // make Add throw; the first pattern to claim a span keeps it.
                    var span = new Tuple<int, int>(start, length);
                    if (!matchSource.ContainsKey(span))
                    {
                        matchSource.Add(span, collection.Value);
                    }
                }
            }

            var last = -1;

            for (var i = 0; i < source.Length; i++)
            {
                if (!matched[i])
                {
                    last = i;
                    continue;
                }

                // Only the last character of a matched run triggers result construction.
                if (i + 1 < source.Length && matched[i + 1])
                {
                    continue;
                }

                var start = last + 1;
                var length = i - last;

                // A run is reported only when it is exactly one recorded span.
                if (matchSource.TryGetValue(new Tuple<int, int>(start, length), out var data))
                {
                    results.Add(new ExtractResult
                    {
                        Start = start,
                        Length = length,
                        Text = source.Substring(start, length),
                        Type = ExtractType,
                        Data = data,
                    });
                }
            }

            // In ExperimentalMode, cases like "from 3 to 5" and "between 10 and 15" are set to closed at both start and end
            if ((Options & NumberOptions.ExperimentalMode) != 0)
            {
                foreach (var result in results)
                {
                    var data = result.Data?.ToString();
                    if (data == NumberRangeConstants.TWONUMBETWEEN ||
                        data == NumberRangeConstants.TWONUMTILL)
                    {
                        result.Data = NumberRangeConstants.TWONUMCLOSED;
                    }
                }
            }

            return results;
        }
Esempio n. 6
0
        /// <summary>
        /// Extracts entities from <paramref name="source"/>: all regex matches are merged into runs
        /// of covered characters, and a run is reported when it coincides exactly with one match.
        /// </summary>
        /// <param name="source">The input text to scan.</param>
        /// <returns>
        /// One <c>ExtractResult</c> per accepted run, in order of appearance; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var extracted = new List<ExtractResult>();
            var sourceByMatch = new Dictionary<Match, string>();
            var covered = new bool[source.Length];

            var matchesByRegex = Regexes.ToDictionary(entry => entry.Key.Matches(source), entry => entry.Value);

            foreach (var entry in matchesByRegex)
            {
                foreach (Match hit in entry.Key)
                {
                    // Flag every character position covered by this match.
                    var end = hit.Index + hit.Length;
                    for (var pos = hit.Index; pos < end; pos++)
                    {
                        covered[pos] = true;
                    }

                    // Remember which source value produced this match.
                    sourceByMatch.Add(hit, entry.Value);
                }
            }

            var gap = -1;

            for (var pos = 0; pos < source.Length; pos++)
            {
                if (!covered[pos])
                {
                    gap = pos;
                    continue;
                }

                // Only the last character of a covered run triggers result construction.
                var runContinues = pos + 1 != source.Length && covered[pos + 1];
                if (runContinues)
                {
                    continue;
                }

                var start = gap + 1;
                var length = pos - gap;
                var piece = source.Substring(start, length);

                // The run must line up exactly with one of the recorded matches.
                var owner = sourceByMatch.Keys.FirstOrDefault(k => k.Index == start && k.Length == length);
                if (owner == null)
                {
                    continue;
                }

                extracted.Add(new ExtractResult
                {
                    Start = start,
                    Length = length,
                    Text = piece,
                    Type = ExtractType,
                    Data = sourceByMatch[owner]
                });
            }

            return extracted;
        }
        /// <summary>
        /// Extracts number-range entities from <paramref name="source"/>. Each regex match is narrowed
        /// by <c>GetMatchedStartAndLength</c>, implausible two-number ranges are discarded, and the
        /// remaining spans are turned into results.
        /// </summary>
        /// <param name="source">The input text to scan.</param>
        /// <returns>
        /// One <c>ExtractResult</c> per accepted span, in order of appearance; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var results = new List<ExtractResult>();
            var matchSource = new Dictionary<Tuple<int, int>, string>();
            var matched = new bool[source.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    GetMatchedStartAndLength(m, collection.Value, source, out int start, out int length);

                    if (start < 0 || length <= 0)
                    {
                        continue;
                    }

                    // Keep Source Data for extra information.
                    // Tuple keys compare by value, so two patterns resolving to the same span would
                    // make Add throw; the first pattern to claim a span keeps it.
                    var span = new Tuple<int, int>(start, length);
                    if (!matchSource.ContainsKey(span))
                    {
                        matchSource.Add(span, collection.Value);
                    }
                }
            }

            // matchSource ordered by decreasing span length so that "no less than x" is before
            // "less than x". It does not change while the loop below runs, so it is built once,
            // and only if a two-number range is actually encountered.
            List<KeyValuePair<Tuple<int, int>, string>> matchList = null;

            foreach (var match in matchSource)
            {
                var start = match.Key.Item1;
                var length = match.Key.Item2;

                // Filter wrong two number ranges such as "more than 20 and less than 10" and "大于20小于10".
                if (match.Value.Equals(NumberRangeConstants.TWONUM, StringComparison.Ordinal))
                {
                    var text = source.Substring(start, length);

                    // Exactly two numbers are required; ordinals are tried when cardinals do not fit.
                    var er = numberExtractor.Extract(text);

                    if (er.Count != 2)
                    {
                        er = ordinalExtractor.Extract(text);

                        if (er.Count != 2)
                        {
                            continue;
                        }
                    }

                    var nums = er.Select(r => (double)(numberParser.Parse(r).Value ?? 0)).ToList();

                    if (matchList == null)
                    {
                        matchList = matchSource.ToList();
                        matchList.Sort((pair1, pair2) => pair2.Key.Item2.CompareTo(pair1.Key.Item2));
                    }

                    // Start offsets of the longest MORE and LESS spans contained in this range.
                    var moreIndex = matchList.First(r =>
                        r.Value.Equals(NumberRangeConstants.MORE, StringComparison.Ordinal) &&
                        r.Key.Item1 >= start && r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

                    var lessIndex = matchList.First(r =>
                        r.Value.Equals(NumberRangeConstants.LESS, StringComparison.Ordinal) &&
                        r.Key.Item1 >= start && r.Key.Item1 + r.Key.Item2 <= start + length).Key.Item1;

                    // Keep the range only when the order of the two numbers agrees with the order
                    // of the "more"/"less" parts.
                    if (!((nums[0] < nums[1] && moreIndex <= lessIndex) || (nums[0] > nums[1] && moreIndex >= lessIndex)))
                    {
                        continue;
                    }
                }

                // The entity is longer than 1, so don't mark the last char to represent the end.
                // To avoid no connector cases like "大于20小于10" being marked as a whole entity.
                for (var j = 0; j < length - 1; j++)
                {
                    matched[start + j] = true;
                }
            }

            var last = -1;

            for (var i = 0; i < source.Length; i++)
            {
                if (!matched[i])
                {
                    last = i;
                    continue;
                }

                // Only the last marked character of a run triggers result construction.
                if (i + 1 < source.Length && matched[i + 1])
                {
                    continue;
                }

                var start = last + 1;

                // +1 restores the final character that was deliberately left unmarked above.
                var length = i - last + 1;
                var substr = source.Substring(start, length);

                // A run is reported only when it is exactly one recorded span.
                if (matchSource.TryGetValue(new Tuple<int, int>(start, length), out var data))
                {
                    results.Add(new ExtractResult
                    {
                        Start = start,
                        Length = length,
                        Text = substr,
                        Type = ExtractType,
                        Data = data,
                    });
                }
            }

            // In ExperimentalMode, cases like "from 3 to 5" and "between 10 and 15" are set to closed at both start and end
            if ((Config.Options & NumberOptions.ExperimentalMode) != 0)
            {
                foreach (var result in results)
                {
                    var data = result.Data?.ToString();
                    if (data == NumberRangeConstants.TWONUMBETWEEN ||
                        data == NumberRangeConstants.TWONUMTILL)
                    {
                        result.Data = NumberRangeConstants.TWONUMCLOSED;
                    }
                }
            }

            return results;
        }
Esempio n. 8
0
        /// <summary>
        /// Extracts entities from <paramref name="text"/>: valid regex matches are merged into runs
        /// of covered characters, and a run is reported when it coincides exactly with one match.
        /// </summary>
        /// <param name="text">The input text to scan.</param>
        /// <returns>
        /// One <c>ExtractResult</c> per accepted run, in order of appearance; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string text)
        {
            var extracted = new List<ExtractResult>();

            if (string.IsNullOrEmpty(text))
            {
                return extracted;
            }

            var sourceByMatch = new Dictionary<Match, string>();
            var covered = new bool[text.Length];

            // Walk every match of every regex and flag the character positions it covers.
            var matchesByRegex = Regexes.ToDictionary(entry => entry.Key.Matches(text), entry => entry.Value);

            foreach (var entry in matchesByRegex)
            {
                foreach (Match hit in entry.Key)
                {
                    if (!IsValidMatch(hit))
                    {
                        continue;
                    }

                    var end = hit.Index + hit.Length;
                    for (var pos = hit.Index; pos < end; pos++)
                    {
                        covered[pos] = true;
                    }

                    // Remember which source value produced this match.
                    sourceByMatch.Add(hit, entry.Value);
                }
            }

            // Turn each maximal covered interval into a result when it equals a recorded match.
            var gap = -1;

            for (var pos = 0; pos < text.Length; pos++)
            {
                if (!covered[pos])
                {
                    gap = pos;
                    continue;
                }

                // Only the last character of a covered run triggers result construction.
                var runContinues = pos + 1 != text.Length && covered[pos + 1];
                if (runContinues)
                {
                    continue;
                }

                var start = gap + 1;
                var length = pos - gap;
                var piece = text.Substring(start, length);

                var owner = sourceByMatch.Keys.FirstOrDefault(k => k.Index == start && k.Length == length);
                if (owner == null)
                {
                    continue;
                }

                extracted.Add(new ExtractResult
                {
                    Start = start,
                    Length = length,
                    Text = piece,
                    Type = ExtractType,
                    Data = sourceByMatch[owner]
                });
            }

            return extracted;
        }
Esempio n. 9
0
        /// <summary>
        /// Extracts numeric/ordinal entities from <paramref name="source"/>: accepted regex matches are
        /// merged into runs of covered characters, and a run is reported when it coincides exactly
        /// with a recorded match.
        /// </summary>
        /// <param name="source">The input text to scan.</param>
        /// <returns>
        /// The extracted results after <c>FilterAmbiguity</c> has been applied; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var result = new List<ExtractResult>();
            var matchSource = new Dictionary<Match, TypeTag>();
            var matched = new bool[source.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(source), p => p.Value);

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    // In ExperimentalMode, AmbiguousFraction like "30000 in 2009" needs to be skipped
                    if ((Options & NumberOptions.ExperimentalMode) != 0 && AmbiguousFractionConnectorsRegex.Match(m.Value).Success)
                    {
                        continue;
                    }

                    // If SuppressExtendedTypes is on, cases like "last", "next" should be skipped
                    if ((Options & NumberOptions.SuppressExtendedTypes) != 0 && m.Groups[Constants.RelativeOrdinalGroupName].Success)
                    {
                        continue;
                    }

                    // Matches containing separators 'in', 'out of' are dropped when the numerator
                    // is greater than the denominator.
                    if (m.Groups["ambiguousSeparator"].Success)
                    {
                        var numerator = m.Groups["numerator"];
                        var denominator = m.Groups["denominator"];
                        int num = ParseNumber(numerator);
                        int den = ParseNumber(denominator);

                        if (num > den)
                        {
                            continue;
                        }
                    }

                    for (var j = 0; j < m.Length; j++)
                    {
                        matched[m.Index + j] = true;
                    }

                    // Keep Source Data for extra information
                    matchSource.Add(m, collection.Value);
                }
            }

            var last = -1;

            for (var i = 0; i < source.Length; i++)
            {
                if (!matched[i])
                {
                    last = i;
                    continue;
                }

                // Only the last character of a matched run triggers result construction.
                if (i + 1 < source.Length && matched[i + 1])
                {
                    continue;
                }

                var start = last + 1;
                var length = i - last;
                var substr = source.Substring(start, length);

                // Pick the match covering exactly this run with the smallest (priority, name).
                // Ordering on those two keys only avoids comparing Match objects: Match is not
                // IComparable, so a tuple Min() that includes it throws as soon as two candidates
                // tie on priority and name. With a stable ordering the first tied candidate wins.
                var best = matchSource
                    .Where(p => p.Key.Index == start && p.Key.Length == length)
                    .OrderBy(p => p.Value.Priority)
                    .ThenBy(p => p.Value.Name)
                    .FirstOrDefault();

                // KeyValuePair is a struct; a null key means no match covers exactly this run.
                if (best.Key == null)
                {
                    continue;
                }

                var type = best.Value.Name;
                var originalMatch = best.Key;

                // Extract negative numbers: a negative term found in the text before the run is
                // prepended to the result.
                if (NegativeNumberTermsRegex != null)
                {
                    var match = NegativeNumberTermsRegex.Match(source.Substring(0, start));
                    if (match.Success)
                    {
                        start = match.Index;
                        length += match.Length;
                        substr = match.Value + substr;
                    }
                }

                var er = new ExtractResult
                {
                    Start = start,
                    Length = length,
                    Text = substr,
                    Type = ExtractType,
                    Data = type,
                };

                // Add Metadata information for Ordinal
                if (ExtractType.Contains(Constants.MODEL_ORDINAL))
                {
                    er.Metadata = new Metadata();
                    if ((Options & NumberOptions.SuppressExtendedTypes) == 0 &&
                        originalMatch.Groups[Constants.RelativeOrdinalGroupName].Success)
                    {
                        er.Metadata.IsOrdinalRelative = true;
                    }
                }

                result.Add(er);
            }

            result = FilterAmbiguity(result, source);

            return result;
        }
        /// <summary>
        /// Extracts entities from <paramref name="source"/>: all regex matches are collected, passed
        /// through <c>RankMatches</c> and <c>RemoveRedundantMatches</c>, merged into runs of covered
        /// characters, and a run is reported when it coincides exactly with a surviving match.
        /// </summary>
        /// <param name="source">The input text to scan.</param>
        /// <returns>
        /// One <c>ExtractResult</c> per accepted run, in order of appearance; an empty list for
        /// null or empty input.
        /// </returns>
        public virtual List<ExtractResult> Extract(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return new List<ExtractResult>();
            }

            var extracted = new List<ExtractResult>();
            var sourceByMatch = new Dictionary<Match, string>();
            var covered = new bool[source.Length];

            var matchesByRegex = Regexes.ToDictionary(entry => entry.Key.Matches(source), entry => entry.Value);

            // Collect every match together with the value registered for its regex.
            foreach (var entry in matchesByRegex)
            {
                foreach (Match hit in entry.Key)
                {
                    sourceByMatch.Add(hit, entry.Value);
                }
            }

            sourceByMatch = RankMatches(sourceByMatch);
            sourceByMatch = RemoveRedundantMatches(sourceByMatch);

            // Flag the character positions covered by the surviving matches.
            foreach (var survivor in sourceByMatch.Keys)
            {
                var end = survivor.Index + survivor.Length;
                for (var pos = survivor.Index; pos < end; pos++)
                {
                    covered[pos] = true;
                }
            }

            var gap = -1;

            for (var pos = 0; pos < source.Length; pos++)
            {
                if (!covered[pos])
                {
                    gap = pos;
                    continue;
                }

                // Only the last character of a covered run triggers result construction.
                var runContinues = pos + 1 != source.Length && covered[pos + 1];
                if (runContinues)
                {
                    continue;
                }

                var start = gap + 1;
                var length = pos - gap;
                var piece = source.Substring(start, length);

                // The run must line up exactly with one of the surviving matches.
                var owner = sourceByMatch.Keys.FirstOrDefault(k => k.Index == start && k.Length == length);
                if (owner == null)
                {
                    continue;
                }

                // Extract negative numbers: prepend a negative term found in the preceding text.
                if (NegativeNumberTermsRegex != null)
                {
                    var negative = NegativeNumberTermsRegex.Match(source.Substring(0, start));
                    if (negative.Success)
                    {
                        start = negative.Index;
                        length += negative.Length;
                        piece = negative.Value + piece;
                    }
                }

                extracted.Add(new ExtractResult
                {
                    Start = start,
                    Length = length,
                    Text = piece,
                    Type = ExtractType,
                    Data = sourceByMatch[owner]
                });
            }

            return extracted;
        }