示例#1
0
        /// <summary>
        /// Verifies that a <c>StringMatcher</c> initialised from a canonical-value dictionary finds each
        /// registered phrase inside a sentence and reports the matched text, canonical value and start index.
        /// </summary>
        public void TestStringMatcher()
        {
            // Canonical values and the phrases registered for each of them.
            var utc8Value = "UTC+08:00";
            var utc8Words = new List<string>
            {
                "beijing time", "chongqing time", "hong kong time", "urumqi time",
            };

            var utc2Value = "UTC+02:00";
            var utc2Words = new List<string>
            {
                "cairo time", "beirut time", "gaza time", "amman time",
            };

            var valueDictionary = new Dictionary<string, List<string>>
            {
                { utc8Value, utc8Words },
                { utc2Value, utc2Words },
            };

            var stringMatcher = new StringMatcher();
            stringMatcher.Init(valueDictionary);

            // Checked in registration order: the UTC+08:00 phrases first, then the UTC+02:00 ones.
            var cases = new[]
            {
                new { Canonical = utc8Value, Words = utc8Words },
                new { Canonical = utc2Value, Words = utc2Words },
            };

            foreach (var testCase in cases)
            {
                foreach (var word in testCase.Words)
                {
                    // Exactly one match is expected per sentence; Single() fails the test otherwise.
                    var found = stringMatcher.Find($"please change {word}, thanks").Single();

                    Assert.AreEqual(word, found.Text);
                    Assert.AreEqual(testCase.Canonical, found.CanonicalValues.First());

                    // "please change " is 14 characters long, so the phrase begins at index 14.
                    Assert.AreEqual(14, found.Start);
                }
            }
        }
        /// <summary>
        /// Verifies that a <c>StringMatcher</c> built from a plain list of values yields exactly one match,
        /// with identical text, when each value is searched for on its own.
        /// </summary>
        public void SimpleTestStringMatcher()
        {
            var words = new List<string> { "China", "Beijing", "City" };
            var matcher = new StringMatcher(words);

            words.ForEach(word =>
            {
                // The searched text is the value itself, so the single match must cover it entirely.
                var onlyMatch = matcher.Find(word).Single();
                Assert.AreEqual(word, onlyMatch.Text);
            });
        }
示例#3
0
        /// <summary>
        /// Temporary solution for removing superfluous words from the text; meant to be used only in Preview mode.
        /// </summary>
        /// <param name="text">The text to search and strip.</param>
        /// <param name="matcher">Matcher whose <c>Find</c> results identify the spans to remove.</param>
        /// <param name="superfluousWordMatches">
        /// Receives every match found in the original (unmodified) text, in the order the matcher returned them.
        /// </param>
        /// <returns>The text with each matched span removed.</returns>
        public static string PreProcessTextRemoveSuperfluousWords(string text, StringMatcher matcher, out List <MatchResult <string> > superfluousWordMatches)
        {
            var found = matcher.Find(text).ToList();
            superfluousWordMatches = found;

            // Match offsets refer to the original text, so every removal is shifted left by the
            // number of characters already taken out.
            // NOTE(review): this presumes the matcher returns matches in ascending, non-overlapping
            // order - confirm against StringMatcher.Find.
            var removedSoFar = 0;

            for (var i = 0; i < found.Count; i++)
            {
                var current = found[i];
                text = text.Remove(current.Start - removedSoFar, current.Length);
                removedSoFar += current.Length;
            }

            return text;
        }
        /// <summary>
        /// Verifies that a <c>StringMatcher</c> built from values and a parallel array of ids returns, for each
        /// value searched on its own, a single match carrying that value's text and its id as the first value.
        /// </summary>
        public void SimpleTestWithIdsStringMatcher()
        {
            var words = new List<string> { "China", "Beijing", "City" };
            var ids = new List<string> { "1", "2", "3" };

            var matcher = new StringMatcher(words, ids.ToArray());

            // Pair every word with the id registered at the same position.
            foreach (var expected in words.Zip(ids, (word, id) => new { Word = word, Id = id }))
            {
                var hit = matcher.Find(expected.Word).Single();

                Assert.AreEqual(expected.Word, hit.Text);
                Assert.AreEqual(expected.Id, hit.Values.First());
            }
        }
        /// <summary>
        /// Extracts number-with-unit entities from <paramref name="source"/>.
        /// Numbers returned by <c>config.UnitNumExtractor</c> are combined with an adjacent prefix unit
        /// (from <c>prefixMatcher</c>) and/or a following suffix unit (from <c>suffixMatcher</c>).
        /// When <c>separateRegex</c> is set, stand-alone units are added and the combined result is
        /// passed through <c>FilterAmbiguity</c>.
        /// </summary>
        /// <param name="source">The text to scan.</param>
        /// <returns>
        /// The extracted results; an empty list when <c>PreCheckStr</c> rejects the input.
        /// Each result built from a number stores that number's <c>ExtractResult</c> in <c>Data</c>.
        /// </returns>
        public List <ExtractResult> Extract(string source)
        {
            var result = new List <ExtractResult>();

            if (!PreCheckStr(source))
            {
                return(result);
            }

            // Prefix unit found for a number, keyed by the number's start index in source.
            var mappingPrefix = new Dictionary <int, PrefixUnitResult>();
            var sourceLen     = source.Length;

            // NOTE(review): this flag is declared once per call and never reset, so after the first
            // number that merges a prefix unit with a suffix unit, the prefix-only branch near the end
            // of the loop is skipped for every later number as well - confirm this is intended.
            var prefixMatched = false;

            // Computed lazily; shared by the dimension check in the loop and by ExtractSeparateUnits.
            MatchCollection nonUnitMatches = null;
            var             prefixMatch    = prefixMatcher.Find(source).OrderBy(o => o.Start).ToList();
            var             suffixMatch    = suffixMatcher.Find(source).OrderBy(o => o.Start).ToList();

            // Numbers are only extracted when at least one unit candidate exists.
            if (prefixMatch.Count > 0 || suffixMatch.Count > 0)
            {
                var numbers = this.config.UnitNumExtractor.Extract(source).OrderBy(o => o.Start);

                // Special case for cases where number multipliers clash with unit
                var ambiguousMultiplierRegex = this.config.AmbiguousUnitNumberMultiplierRegex;
                if (ambiguousMultiplierRegex != null)
                {
                    foreach (var number in numbers)
                    {
                        var match = ambiguousMultiplierRegex.Matches(number.Text);
                        if (match.Count == 1)
                        {
                            // Shorten the number (in place) by the length of the single match,
                            // dropping that many characters from the end of its text.
                            var newLength = number.Text.Length - match[0].Length;
                            number.Text   = number.Text.Substring(0, newLength);
                            number.Length = newLength;
                        }
                    }
                }

                foreach (var number in numbers)
                {
                    if (number.Start == null || number.Length == null)
                    {
                        continue;
                    }

                    int start = (int)number.Start, length = (int)number.Length;

                    // Only tested against zero below: zero when the number sits at index 0
                    // or maxPrefixMatchLen is 0.
                    var maxFindPref = Math.Min(maxPrefixMatchLen, number.Start.Value);

                    // Number of characters left in source after the number.
                    var maxFindSuff = sourceLen - start - length;

                    if (maxFindPref != 0)
                    {
                        // Scan the prefix matches from left to right (they are sorted by Start) and take
                        // the first one whose text equals the trimmed span running from the match start
                        // up to the number.
                        var lastIndex = start;
                        MatchResult <string> bestMatch = null;

                        foreach (var m in prefixMatch)
                        {
                            // Stop at the first non-empty match that ends past the start of the number.
                            if (m.Length > 0 && m.End > start)
                            {
                                break;
                            }

                            if (m.Length > 0 && source.Substring(m.Start, lastIndex - m.Start).Trim() == m.Text)
                            {
                                bestMatch = m;
                                break;
                            }
                        }

                        if (bestMatch != null)
                        {
                            // The stored unit string is the raw span up to the number, so it keeps any
                            // whitespace between the unit and the number.
                            var offSet  = lastIndex - bestMatch.Start;
                            var unitStr = source.Substring(bestMatch.Start, offSet);
                            mappingPrefix.Add(number.Start.Value, new PrefixUnitResult {
                                Offset = offSet, UnitStr = unitStr
                            });
                        }
                    }

                    // prefixUnit stays null when no prefix unit was recorded for this number.
                    mappingPrefix.TryGetValue(start, out PrefixUnitResult prefixUnit);
                    if (maxFindSuff > 0)
                    {
                        // Find the best suffix unit: among the matches starting at or after the end of
                        // the number and separated from it only by whitespace or the connector token,
                        // keep the one that reaches farthest.
                        var maxlen     = 0;
                        var firstIndex = start + length;

                        foreach (var m in suffixMatch)
                        {
                            if (m.Length > 0 && m.Start >= firstIndex)
                            {
                                // Distance from the end of the number to the end of this match.
                                var endpos = m.Start + m.Length - firstIndex;
                                if (maxlen < endpos)
                                {
                                    var midStr = source.Substring(firstIndex, m.Start - firstIndex);
                                    if (string.IsNullOrWhiteSpace(midStr) || midStr.Trim().Equals(this.config.ConnectorToken))
                                    {
                                        maxlen = endpos;
                                    }
                                }
                            }
                        }

                        if (maxlen != 0)
                        {
                            var substr = source.Substring(start, length + maxlen);
                            var er     = new ExtractResult
                            {
                                Start  = start,
                                Length = length + maxlen,
                                Text   = substr,
                                Type   = this.config.ExtractType,
                            };

                            // Widen the result to the left when a prefix unit was also found.
                            if (prefixUnit != null)
                            {
                                prefixMatched = true;
                                er.Start     -= prefixUnit.Offset;
                                er.Length    += prefixUnit.Offset;
                                er.Text       = prefixUnit.UnitStr + er.Text;
                            }

                            // Relative position will be used in Parser.
                            // The number object itself is modified: its Start becomes an offset inside er.
                            number.Start = start - er.Start;
                            er.Data      = number;

                            // Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                            var isNotUnit = false;
                            if (er.Type.Equals(Constants.SYS_UNIT_DIMENSION, StringComparison.Ordinal))
                            {
                                if (nonUnitMatches == null)
                                {
                                    nonUnitMatches = this.config.NonUnitRegex.Matches(source);
                                }

                                // Reject the candidate when it lies entirely inside a non-unit match.
                                foreach (Match time in nonUnitMatches)
                                {
                                    if (er.Start >= time.Index && er.Start + er.Length <= time.Index + time.Length)
                                    {
                                        isNotUnit = true;
                                        break;
                                    }
                                }
                            }

                            // Skips the rest of this iteration, including the prefix-only branch below.
                            if (isNotUnit)
                            {
                                continue;
                            }

                            result.Add(er);
                        }
                    }

                    // Prefix unit without an accepted suffix unit: emit "<unit><number>".
                    if (prefixUnit != null && !prefixMatched)
                    {
                        var er = new ExtractResult
                        {
                            Start  = number.Start - prefixUnit.Offset,
                            Length = number.Length + prefixUnit.Offset,
                            Text   = prefixUnit.UnitStr + number.Text,
                            Type   = this.config.ExtractType,
                        };

                        // Relative position will be used in Parser
                        number.Start = start - er.Start;
                        er.Data      = number;
                        result.Add(er);
                    }
                }
            }

            // Extract separate units (units that appear without a number)
            if (separateRegex != null)
            {
                if (nonUnitMatches == null)
                {
                    nonUnitMatches = this.config.NonUnitRegex.Matches(source);
                }

                ExtractSeparateUnits(source, result, nonUnitMatches);

                // Remove common ambiguous cases
                result = FilterAmbiguity(result, source);
            }

            return(result);
        }
示例#6
0
        /// <summary>
        /// Extracts number-with-unit entities from <paramref name="source"/>.
        /// Numbers returned by <c>config.UnitNumExtractor</c> are combined with an adjacent prefix unit
        /// (from <c>prefixMatcher</c>) and/or a following suffix unit (from <c>suffixMatcher</c>).
        /// For the currency extractor type, prefix and suffix candidates are collected separately and
        /// resolved by <c>SelectCandidates</c>; for other types a prefix and a suffix around the same
        /// number are merged into one result.
        /// </summary>
        /// <param name="source">The text to scan.</param>
        /// <returns>
        /// The extracted results; an empty list when <c>PreCheckStr</c> rejects the input.
        /// Each result built from a number stores a clone of that number's <c>ExtractResult</c> in
        /// <c>Data</c>, with <c>Start</c> rewritten relative to the result.
        /// </returns>
        public List <ExtractResult> Extract(string source)
        {
            var result = new List<ExtractResult>();

            if (!PreCheckStr(source))
            {
                return result;
            }

            // Prefix unit found for a number, keyed by the number's start index in source.
            var mappingPrefix = new Dictionary<int, PrefixUnitResult>();
            var sourceLen = source.Length;

            // NOTE(review): declared once per call and never reset, so after the first non-currency
            // number that merges a prefix with a suffix, the prefix-only branches are skipped for
            // every later number as well - confirm this is intended.
            var prefixMatched = false;

            // One flag per result added in the currency branches below (true = built from a prefix
            // unit, false = built from a suffix unit); handed to SelectCandidates at the end.
            var unitIsPrefix = new List<bool>();

            // Computed lazily; shared by the dimension check in the loop and by ExtractSeparateUnits.
            MatchCollection nonUnitMatches = null;
            var prefixMatches = prefixMatcher.Find(source).OrderBy(o => o.Start).ToList();
            var suffixMatches = suffixMatcher.Find(source).OrderBy(o => o.Start).ToList();

            // Numbers are only extracted when at least one unit candidate exists.
            if (prefixMatches.Count > 0 || suffixMatches.Count > 0)
            {
                // Materialized once so that the several passes below do not re-sort the sequence.
                var numbers = this.config.UnitNumExtractor.Extract(source).OrderBy(o => o.Start).ToList();

                // Checking if there are conflicting interpretations between currency unit as prefix and suffix for each number.
                // For example, in Chinese, "$20,300美圆" should be broken into two entities instead of treating 20,300 as one number: "$20" and "300美圆".
                if (numbers.Count > 0 && CheckExtractorType(Constants.SYS_UNIT_CURRENCY) && prefixMatches.Count > 0 && suffixMatches.Count > 0)
                {
                    foreach (var number in numbers)
                    {
                        // Same guard as the main loop: numbers without a position cannot be located in source.
                        if (number.Start == null || number.Length == null)
                        {
                            continue;
                        }

                        var numberStart = number.Start.Value;
                        var numberEnd = numberStart + number.Length.Value;

                        var hasAdjacentPrefix = prefixMatches.Any(o => o.Start + o.Length == numberStart);
                        var hasAdjacentSuffix = suffixMatches.Any(o => o.Start == numberEnd);

                        if (hasAdjacentPrefix && hasAdjacentSuffix)
                        {
                            var commaOffset = number.Text.IndexOf(',');
                            if (commaOffset >= 0)
                            {
                                // Replace the first comma with a space; the length of source is unchanged,
                                // so previously computed match positions remain valid.
                                var commaIndex = numberStart + commaOffset;
                                source = source.Substring(0, commaIndex) + " " + source.Substring(commaIndex + 1);
                            }
                        }
                    }

                    // Re-extract the numbers from the (possibly) modified text.
                    numbers = this.config.UnitNumExtractor.Extract(source).OrderBy(o => o.Start).ToList();
                }

                // Special case for cases where number multipliers clash with unit
                var ambiguousMultiplierRegex = this.config.AmbiguousUnitNumberMultiplierRegex;
                if (ambiguousMultiplierRegex != null)
                {
                    foreach (var number in numbers)
                    {
                        var match = ambiguousMultiplierRegex.Matches(number.Text);
                        if (match.Count == 1)
                        {
                            // Shorten the number (in place) by the length of the single match,
                            // dropping that many characters from the end of its text.
                            var newLength = number.Text.Length - match[0].Length;
                            number.Text = number.Text.Substring(0, newLength);
                            number.Length = newLength;
                        }
                    }
                }

                foreach (var number in numbers)
                {
                    if (number.Start == null || number.Length == null)
                    {
                        continue;
                    }

                    int start = (int)number.Start, length = (int)number.Length;

                    // Only tested against zero below: zero when the number sits at index 0
                    // or maxPrefixMatchLen is 0.
                    var maxFindPref = Math.Min(maxPrefixMatchLen, number.Start.Value);

                    // Number of characters left in source after the number.
                    var maxFindSuff = sourceLen - start - length;

                    // True when the prefix unit touches the number with no whitespace in between.
                    var closeMatch = false;
                    if (maxFindPref != 0)
                    {
                        // Scan the prefix matches from left to right (they are sorted by Start) and take
                        // the first one whose text equals the trimmed span running from the match start
                        // up to the number.
                        var lastIndex = start;
                        MatchResult<string> bestMatch = null;

                        foreach (var m in prefixMatches)
                        {
                            // Zero-length matches never qualify. They must be skipped before the
                            // Substring call below: such a match may start after the number, which
                            // would make the requested length negative and throw.
                            if (m.Length <= 0)
                            {
                                continue;
                            }

                            // Stop at the first match that ends past the start of the number.
                            if (m.End > start)
                            {
                                break;
                            }

                            var candidate = source.Substring(m.Start, lastIndex - m.Start);
                            if (candidate.Trim() == m.Text)
                            {
                                if (candidate == m.Text)
                                {
                                    closeMatch = true;
                                }

                                bestMatch = m;
                                break;
                            }
                        }

                        if (bestMatch != null)
                        {
                            // The stored unit string is the raw span up to the number, so it keeps any
                            // whitespace between the unit and the number.
                            var offSet = lastIndex - bestMatch.Start;
                            var unitStr = source.Substring(bestMatch.Start, offSet);
                            mappingPrefix[number.Start.Value] = new PrefixUnitResult
                            {
                                Offset = offSet, UnitStr = unitStr
                            };
                        }
                    }

                    // prefixUnit stays null when no prefix unit was recorded for this number.
                    mappingPrefix.TryGetValue(start, out PrefixUnitResult prefixUnit);

                    // For currency unit, such as "$ 10 $ 20", get candidate "$ 10" "10 $" "$20" then select to get result.
                    // So add "$ 10" to result here, then get "10 $" in the suffixMatch.
                    // But for case like "摄氏温度10度", "摄氏温度10" will skip this and continue to extend the suffix.
                    if (prefixUnit != null && !prefixMatched && CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
                    {
                        var er = new ExtractResult
                        {
                            Start = number.Start - prefixUnit.Offset,
                            Length = number.Length + prefixUnit.Offset,
                            Text = prefixUnit.UnitStr + number.Text,
                            Type = this.config.ExtractType,
                        };

                        // Relative position will be used in Parser
                        var numberData = number.Clone();
                        numberData.Start = start - er.Start;
                        er.Data = numberData;

                        result.Add(er);
                        unitIsPrefix.Add(true);
                    }

                    if (maxFindSuff > 0)
                    {
                        // If the number already get close prefix currency unit, skip the suffix match.
                        if (CheckExtractorType(Constants.SYS_UNIT_CURRENCY) && closeMatch)
                        {
                            continue;
                        }

                        // Find the best suffix unit: among the matches starting at or after the end of
                        // the number and separated from it only by whitespace or the connector token,
                        // keep the one that reaches farthest.
                        var maxlen = 0;
                        var firstIndex = start + length;

                        foreach (var m in suffixMatches)
                        {
                            if (m.Length > 0 && m.Start >= firstIndex)
                            {
                                // Distance from the end of the number to the end of this match.
                                var endpos = m.Start + m.Length - firstIndex;
                                if (maxlen < endpos)
                                {
                                    var midStr = source.Substring(firstIndex, m.Start - firstIndex);
                                    if (string.IsNullOrWhiteSpace(midStr) || midStr.Trim().Equals(this.config.ConnectorToken, StringComparison.Ordinal))
                                    {
                                        maxlen = endpos;
                                    }
                                }
                            }
                        }

                        if (maxlen != 0)
                        {
                            var substr = source.Substring(start, length + maxlen);

                            var er = new ExtractResult
                            {
                                Start = start,
                                Length = length + maxlen,
                                Text = substr,
                                Type = this.config.ExtractType,
                            };

                            // Outside currency mode, a prefix and a suffix around the same number are
                            // merged into a single result.
                            if (prefixUnit != null && !CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
                            {
                                prefixMatched = true;
                                er.Start -= prefixUnit.Offset;
                                er.Length += prefixUnit.Offset;
                                er.Text = prefixUnit.UnitStr + er.Text;
                            }

                            // Relative position will be used in Parser
                            var numberData = number.Clone();
                            numberData.Start = start - er.Start;
                            er.Data = numberData;

                            // Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                            var isNotUnit = false;
                            if (er.Type.Equals(Constants.SYS_UNIT_DIMENSION, StringComparison.Ordinal))
                            {
                                if (nonUnitMatches == null)
                                {
                                    nonUnitMatches = this.config.NonUnitRegex.Matches(source);
                                }

                                // Reject the candidate when it lies entirely inside a non-unit match.
                                foreach (Match time in nonUnitMatches)
                                {
                                    if (er.Start >= time.Index && er.Start + er.Length <= time.Index + time.Length)
                                    {
                                        isNotUnit = true;
                                        break;
                                    }
                                }
                            }

                            // Skips the rest of this iteration, including the prefix-only branch below.
                            if (isNotUnit)
                            {
                                continue;
                            }

                            result.Add(er);
                            unitIsPrefix.Add(false);
                        }
                    }

                    // Non-currency prefix unit without an accepted suffix unit: emit "<unit><number>".
                    if (prefixUnit != null && !prefixMatched && !CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
                    {
                        var er = new ExtractResult
                        {
                            Start = number.Start - prefixUnit.Offset,
                            Length = number.Length + prefixUnit.Offset,
                            Text = prefixUnit.UnitStr + number.Text,
                            Type = this.config.ExtractType,
                        };

                        // Relative position will be used in Parser
                        var numberData = number.Clone();
                        numberData.Start = start - er.Start;
                        er.Data = numberData;

                        result.Add(er);
                    }
                }
            }

            // Extract separate units (units that appear without a number)
            if (separateRegex != null)
            {
                if (nonUnitMatches == null)
                {
                    nonUnitMatches = this.config.NonUnitRegex.Matches(source);
                }

                ExtractSeparateUnits(source, result, nonUnitMatches);
            }

            // Remove common ambiguous cases
            result = FilterAmbiguity(result, source);

            if (CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
            {
                result = SelectCandidates(source, result, unitIsPrefix);
            }

            return result;
        }