/// <summary>
/// Verifies that a dictionary-initialised <c>StringMatcher</c> finds each synonym inside a
/// sentence and reports the matched text, its canonical value and its start offset.
/// </summary>
public void TestStringMatcher()
{
    var utc8Value = "UTC+08:00";
    var utc8Words = new List<string>
    {
        "beijing time",
        "chongqing time",
        "hong kong time",
        "urumqi time",
    };

    var utc2Value = "UTC+02:00";
    var utc2Words = new List<string>
    {
        "cairo time",
        "beirut time",
        "gaza time",
        "amman time",
    };

    var valueDictionary = new Dictionary<string, List<string>>
    {
        { utc8Value, utc8Words },
        { utc2Value, utc2Words },
    };

    var stringMatcher = new StringMatcher();
    stringMatcher.Init(valueDictionary);

    // Each dictionary entry maps a canonical value to the synonyms that must resolve to it.
    foreach (var entry in valueDictionary)
    {
        var canonical = entry.Key;

        foreach (var word in entry.Value)
        {
            var sentence = $"please change {word}, thanks";
            var found = stringMatcher.Find(sentence);
            var only = found.Single();

            Assert.AreEqual(word, only.Text);
            Assert.AreEqual(canonical, only.CanonicalValues.First());

            // "please change " is 14 characters long, so the synonym starts at index 14.
            Assert.AreEqual(14, only.Start);
        }
    }
}
/// <summary>
/// Verifies that a <c>StringMatcher</c> built from a plain value list finds each value
/// exactly once when the value itself is used as the query text.
/// </summary>
public void SimpleTestStringMatcher()
{
    var values = new List<string> { "China", "Beijing", "City" };
    var stringMatcher = new StringMatcher(values);

    foreach (var expected in values)
    {
        var onlyMatch = stringMatcher.Find(expected).Single();
        Assert.AreEqual(expected, onlyMatch.Text);
    }
}
/// <summary>
/// Removes every span that <paramref name="matcher"/> finds in <paramref name="text"/>.
/// Temporary solution for removing superfluous words; only used under the Preview mode.
/// </summary>
/// <param name="text">The text to strip.</param>
/// <param name="matcher">Matcher whose hits are removed from the text.</param>
/// <param name="superfluousWordMatches">
/// Receives the matches that were removed, with offsets relative to the original text.
/// </param>
/// <returns>The text with all matched spans removed.</returns>
public static string PreProcessTextRemoveSuperfluousWords(string text, StringMatcher matcher, out List<MatchResult<string>> superfluousWordMatches)
{
    superfluousWordMatches = matcher.Find(text).ToList();

    // Match offsets refer to the original text, so each removal position is shifted left
    // by the number of characters already removed.
    var removedSoFar = 0;
    for (var i = 0; i < superfluousWordMatches.Count; i++)
    {
        var current = superfluousWordMatches[i];
        var shiftedStart = current.Start - removedSoFar;

        text = text.Remove(shiftedStart, current.Length);
        removedSoFar += current.Length;
    }

    return text;
}
/// <summary>
/// Verifies that a <c>StringMatcher</c> built from parallel value and id collections
/// returns, for each value, a single match carrying the matching text and its id.
/// </summary>
public void SimpleTestWithIdsStringMatcher()
{
    var values = new List<string> { "China", "Beijing", "City" };
    var ids = new List<string> { "1", "2", "3" };

    var stringMatcher = new StringMatcher(values, ids.ToArray());

    // Walk the two parallel lists together: the n-th value must resolve to the n-th id.
    foreach (var pair in values.Zip(ids, (text, id) => new { Text = text, Id = id }))
    {
        var found = stringMatcher.Find(pair.Text).Single();

        Assert.AreEqual(pair.Text, found.Text);
        Assert.AreEqual(pair.Id, found.Values.First());
    }
}
/// <summary>
/// Extracts numbers together with an adjacent unit from <paramref name="source"/>.
/// Numbers come from <c>config.UnitNumExtractor</c>; units come from the prefix and suffix
/// matchers. When <c>separateRegex</c> is set, stand-alone units are extracted as well and
/// the combined result is passed through <c>FilterAmbiguity</c>.
/// </summary>
/// <param name="source">The text to extract from.</param>
/// <returns>
/// The extracted results; an empty list when <c>PreCheckStr</c> rejects the input.
/// Each number-based result stores the number in <c>Data</c>, with the number's
/// <c>Start</c> rewritten to be relative to the result's own start.
/// </returns>
public List<ExtractResult> Extract(string source)
{
    var result = new List<ExtractResult>();

    if (!PreCheckStr(source))
    {
        return result;
    }

    var mappingPrefix = new Dictionary<int, PrefixUnitResult>();
    var sourceLen = source.Length;

    // NOTE(review): this flag is declared outside the per-number loop and never reset, so once
    // one number has matched both a prefix and a suffix, prefix-only results are suppressed for
    // all following numbers. Confirm this is intended before changing it.
    var prefixMatched = false;

    MatchCollection nonUnitMatches = null;
    var prefixMatch = prefixMatcher.Find(source).OrderBy(o => o.Start).ToList();
    var suffixMatch = suffixMatcher.Find(source).OrderBy(o => o.Start).ToList();

    if (prefixMatch.Count > 0 || suffixMatch.Count > 0)
    {
        // Materialise once: the sequence is walked twice below and must not be re-sorted.
        var numbers = this.config.UnitNumExtractor.Extract(source).OrderBy(o => o.Start).ToList();

        // Special case for cases where number multipliers clash with unit
        var ambiguousMultiplierRegex = this.config.AmbiguousUnitNumberMultiplierRegex;
        if (ambiguousMultiplierRegex != null)
        {
            foreach (var number in numbers)
            {
                var match = ambiguousMultiplierRegex.Matches(number.Text);
                if (match.Count == 1)
                {
                    // Drop the ambiguous multiplier from the number so it can be read as a unit.
                    var newLength = number.Text.Length - match[0].Length;
                    number.Text = number.Text.Substring(0, newLength);
                    number.Length = newLength;
                }
            }
        }

        foreach (var number in numbers)
        {
            if (number.Start == null || number.Length == null)
            {
                continue;
            }

            int start = (int)number.Start, length = (int)number.Length;
            var maxFindPref = Math.Min(maxPrefixMatchLen, start);
            var maxFindSuff = sourceLen - start - length;

            if (maxFindPref != 0)
            {
                // Scan from left to right, find the longest match
                var lastIndex = start;
                MatchResult<string> bestMatch = null;

                foreach (var m in prefixMatch)
                {
                    // Matches are ordered by start; one ending after the number starts cannot be a prefix.
                    if (m.Length > 0 && m.End > start)
                    {
                        break;
                    }

                    // Accept the match only if nothing but whitespace separates it from the number.
                    if (m.Length > 0 && source.Substring(m.Start, lastIndex - m.Start).Trim() == m.Text)
                    {
                        bestMatch = m;
                        break;
                    }
                }

                if (bestMatch != null)
                {
                    var offSet = lastIndex - bestMatch.Start;
                    var unitStr = source.Substring(bestMatch.Start, offSet);

                    // Indexer assignment instead of Add: Add would throw if two numbers share a start index.
                    mappingPrefix[start] = new PrefixUnitResult { Offset = offSet, UnitStr = unitStr };
                }
            }

            mappingPrefix.TryGetValue(start, out PrefixUnitResult prefixUnit);

            if (maxFindSuff > 0)
            {
                // find the best suffix unit
                var maxlen = 0;
                var firstIndex = start + length;

                foreach (var m in suffixMatch)
                {
                    if (m.Length > 0 && m.Start >= firstIndex)
                    {
                        var endpos = m.Start + m.Length - firstIndex;
                        if (maxlen < endpos)
                        {
                            // Only whitespace or the configured connector may sit between number and unit.
                            var midStr = source.Substring(firstIndex, m.Start - firstIndex);
                            if (string.IsNullOrWhiteSpace(midStr) ||
                                midStr.Trim().Equals(this.config.ConnectorToken, StringComparison.Ordinal))
                            {
                                maxlen = endpos;
                            }
                        }
                    }
                }

                if (maxlen != 0)
                {
                    var substr = source.Substring(start, length + maxlen);

                    var er = new ExtractResult
                    {
                        Start = start,
                        Length = length + maxlen,
                        Text = substr,
                        Type = this.config.ExtractType,
                    };

                    if (prefixUnit != null)
                    {
                        // Extend the result to the left so it also covers the prefix unit.
                        prefixMatched = true;
                        er.Start -= prefixUnit.Offset;
                        er.Length += prefixUnit.Offset;
                        er.Text = prefixUnit.UnitStr + er.Text;
                    }

                    // Relative position will be used in Parser
                    number.Start = start - er.Start;
                    er.Data = number;

                    // Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                    var isNotUnit = false;
                    if (er.Type.Equals(Constants.SYS_UNIT_DIMENSION, StringComparison.Ordinal))
                    {
                        if (nonUnitMatches == null)
                        {
                            nonUnitMatches = this.config.NonUnitRegex.Matches(source);
                        }

                        foreach (Match time in nonUnitMatches)
                        {
                            // Discard the candidate when it lies entirely inside a non-unit match.
                            if (er.Start >= time.Index && er.Start + er.Length <= time.Index + time.Length)
                            {
                                isNotUnit = true;
                                break;
                            }
                        }
                    }

                    if (isNotUnit)
                    {
                        continue;
                    }

                    result.Add(er);
                }
            }

            if (prefixUnit != null && !prefixMatched)
            {
                // Prefix unit without a usable suffix: emit "<prefix><number>".
                var er = new ExtractResult
                {
                    Start = number.Start - prefixUnit.Offset,
                    Length = number.Length + prefixUnit.Offset,
                    Text = prefixUnit.UnitStr + number.Text,
                    Type = this.config.ExtractType,
                };

                // Relative position will be used in Parser
                number.Start = start - er.Start;
                er.Data = number;
                result.Add(er);
            }
        }
    }

    // Extract Separate unit
    if (separateRegex != null)
    {
        if (nonUnitMatches == null)
        {
            nonUnitMatches = this.config.NonUnitRegex.Matches(source);
        }

        ExtractSeparateUnits(source, result, nonUnitMatches);

        // Remove common ambiguous cases
        result = FilterAmbiguity(result, source);
    }

    return result;
}
/// <summary>
/// Extracts numbers together with an adjacent unit from <paramref name="source"/>.
/// Numbers come from <c>config.UnitNumExtractor</c>; units come from the prefix and suffix
/// matchers. Stand-alone units are added when <c>separateRegex</c> is set, the result is
/// passed through <c>FilterAmbiguity</c>, and for the currency extractor type the
/// prefix/suffix candidates are finally reduced by <c>SelectCandidates</c>.
/// </summary>
/// <param name="source">The text to extract from.</param>
/// <returns>
/// The extracted results; an empty list when <c>PreCheckStr</c> rejects the input.
/// Each number-based result stores a clone of the number in <c>Data</c>, with the clone's
/// <c>Start</c> made relative to the result's own start.
/// </returns>
public List<ExtractResult> Extract(string source)
{
    var result = new List<ExtractResult>();

    if (!PreCheckStr(source))
    {
        return result;
    }

    var mappingPrefix = new Dictionary<int, PrefixUnitResult>();
    var sourceLen = source.Length;

    // NOTE(review): this flag is declared outside the per-number loop and never reset, so once
    // set it suppresses prefix-only results for all following numbers. Confirm this is intended.
    var prefixMatched = false;

    // One entry per currency candidate added to result inside the number loop:
    // true for a prefix candidate, false for a suffix candidate.
    var unitIsPrefix = new List<bool>();

    MatchCollection nonUnitMatches = null;
    var prefixMatches = prefixMatcher.Find(source).OrderBy(o => o.Start).ToList();
    var suffixMatches = suffixMatcher.Find(source).OrderBy(o => o.Start).ToList();

    if (prefixMatches.Count > 0 || suffixMatches.Count > 0)
    {
        // Materialise once: the sequence is walked several times below and must not be re-sorted.
        var numbers = this.config.UnitNumExtractor.Extract(source).OrderBy(o => o.Start).ToList();

        // Checking if there are conflicting interpretations between currency unit as prefix and suffix for each number.
        // For example, in Chinese, "$20,300美圆" should be broken into two entities instead of treating 20,300 as one number: "$20" and "300美圆".
        if (numbers.Count > 0 && CheckExtractorType(Constants.SYS_UNIT_CURRENCY) && prefixMatches.Count > 0 && suffixMatches.Count > 0)
        {
            foreach (var number in numbers)
            {
                // Same guard as the main loop below: results without a position cannot be inspected.
                if (number.Start == null || number.Length == null)
                {
                    continue;
                }

                var numberStart = number.Start.Value;
                var numberEnd = numberStart + number.Length.Value;

                var hasAdjacentPrefix = prefixMatches.Any(o => o.Start + o.Length == numberStart);
                var hasAdjacentSuffix = suffixMatches.Any(o => o.Start == numberEnd);

                if (hasAdjacentPrefix && hasAdjacentSuffix)
                {
                    // Ordinal char search, so the index always points at the literal comma.
                    var commaOffset = number.Text.IndexOf(',');
                    if (commaOffset >= 0)
                    {
                        // Replace the comma with a space (same length, so other offsets stay valid)
                        // so that the number extractor splits the number in two on the next pass.
                        var commaIndex = numberStart + commaOffset;
                        source = source.Substring(0, commaIndex) + " " + source.Substring(commaIndex + 1);
                    }
                }
            }

            numbers = this.config.UnitNumExtractor.Extract(source).OrderBy(o => o.Start).ToList();
        }

        // Special case for cases where number multipliers clash with unit
        var ambiguousMultiplierRegex = this.config.AmbiguousUnitNumberMultiplierRegex;
        if (ambiguousMultiplierRegex != null)
        {
            foreach (var number in numbers)
            {
                var match = ambiguousMultiplierRegex.Matches(number.Text);
                if (match.Count == 1)
                {
                    // Drop the ambiguous multiplier from the number so it can be read as a unit.
                    var newLength = number.Text.Length - match[0].Length;
                    number.Text = number.Text.Substring(0, newLength);
                    number.Length = newLength;
                }
            }
        }

        foreach (var number in numbers)
        {
            if (number.Start == null || number.Length == null)
            {
                continue;
            }

            int start = (int)number.Start, length = (int)number.Length;
            var maxFindPref = Math.Min(maxPrefixMatchLen, start);
            var maxFindSuff = sourceLen - start - length;

            // True when the prefix unit touches the number with no whitespace in between.
            var closeMatch = false;

            if (maxFindPref != 0)
            {
                // Scan from left to right, find the longest match
                var lastIndex = start;
                MatchResult<string> bestMatch = null;

                foreach (var m in prefixMatches)
                {
                    // Empty matches can never be a unit. Skipping them before the Substring call
                    // also avoids a negative length when such a match sits after the number.
                    if (m.Length <= 0)
                    {
                        continue;
                    }

                    // Matches are ordered by start; one ending after the number starts cannot be a prefix.
                    if (m.End > start)
                    {
                        break;
                    }

                    var candidate = source.Substring(m.Start, lastIndex - m.Start);
                    if (candidate.Trim() == m.Text)
                    {
                        if (candidate == m.Text)
                        {
                            closeMatch = true;
                        }

                        bestMatch = m;
                        break;
                    }
                }

                if (bestMatch != null)
                {
                    var offSet = lastIndex - bestMatch.Start;
                    var unitStr = source.Substring(bestMatch.Start, offSet);
                    mappingPrefix[start] = new PrefixUnitResult { Offset = offSet, UnitStr = unitStr };
                }
            }

            mappingPrefix.TryGetValue(start, out PrefixUnitResult prefixUnit);

            // For currency unit, such as "$ 10 $ 20", get candidate "$ 10" "10 $" "$20" then select to get result.
            // So add "$ 10" to result here, then get "10 $" in the suffixMatch.
            // But for case like "摄氏温度10度", "摄氏温度10" will skip this and continue to extend the suffix.
            if (prefixUnit != null && !prefixMatched && CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
            {
                var er = new ExtractResult
                {
                    Start = number.Start - prefixUnit.Offset,
                    Length = number.Length + prefixUnit.Offset,
                    Text = prefixUnit.UnitStr + number.Text,
                    Type = this.config.ExtractType,
                };

                // Relative position will be used in Parser
                var numberData = number.Clone();
                numberData.Start = start - er.Start;
                er.Data = numberData;

                result.Add(er);
                unitIsPrefix.Add(true);
            }

            if (maxFindSuff > 0)
            {
                // If the number already get close prefix currency unit, skip the suffix match.
                if (CheckExtractorType(Constants.SYS_UNIT_CURRENCY) && closeMatch)
                {
                    continue;
                }

                // find the best suffix unit
                var maxlen = 0;
                var firstIndex = start + length;

                foreach (var m in suffixMatches)
                {
                    if (m.Length > 0 && m.Start >= firstIndex)
                    {
                        var endpos = m.Start + m.Length - firstIndex;
                        if (maxlen < endpos)
                        {
                            // Only whitespace or the configured connector may sit between number and unit.
                            var midStr = source.Substring(firstIndex, m.Start - firstIndex);
                            if (string.IsNullOrWhiteSpace(midStr) ||
                                midStr.Trim().Equals(this.config.ConnectorToken, StringComparison.Ordinal))
                            {
                                maxlen = endpos;
                            }
                        }
                    }
                }

                if (maxlen != 0)
                {
                    var substr = source.Substring(start, length + maxlen);

                    var er = new ExtractResult
                    {
                        Start = start,
                        Length = length + maxlen,
                        Text = substr,
                        Type = this.config.ExtractType,
                    };

                    if (prefixUnit != null && !CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
                    {
                        // Non-currency: extend the result to the left so it also covers the prefix unit.
                        prefixMatched = true;
                        er.Start -= prefixUnit.Offset;
                        er.Length += prefixUnit.Offset;
                        er.Text = prefixUnit.UnitStr + er.Text;
                    }

                    // Relative position will be used in Parser
                    var numberData = number.Clone();
                    numberData.Start = start - er.Start;
                    er.Data = numberData;

                    // Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                    var isNotUnit = false;
                    if (er.Type.Equals(Constants.SYS_UNIT_DIMENSION, StringComparison.Ordinal))
                    {
                        if (nonUnitMatches == null)
                        {
                            nonUnitMatches = this.config.NonUnitRegex.Matches(source);
                        }

                        foreach (Match time in nonUnitMatches)
                        {
                            // Discard the candidate when it lies entirely inside a non-unit match.
                            if (er.Start >= time.Index && er.Start + er.Length <= time.Index + time.Length)
                            {
                                isNotUnit = true;
                                break;
                            }
                        }
                    }

                    if (isNotUnit)
                    {
                        continue;
                    }

                    result.Add(er);
                    unitIsPrefix.Add(false);
                }
            }

            if (prefixUnit != null && !prefixMatched && !CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
            {
                // Non-currency prefix unit without a usable suffix: emit "<prefix><number>".
                var er = new ExtractResult
                {
                    Start = number.Start - prefixUnit.Offset,
                    Length = number.Length + prefixUnit.Offset,
                    Text = prefixUnit.UnitStr + number.Text,
                    Type = this.config.ExtractType,
                };

                // Relative position will be used in Parser
                var numberData = number.Clone();
                numberData.Start = start - er.Start;
                er.Data = numberData;
                result.Add(er);
            }
        }
    }

    // Extract Separate unit
    if (separateRegex != null)
    {
        if (nonUnitMatches == null)
        {
            nonUnitMatches = this.config.NonUnitRegex.Matches(source);
        }

        ExtractSeparateUnits(source, result, nonUnitMatches);
    }

    // Remove common ambiguous cases
    result = FilterAmbiguity(result, source);

    if (CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
    {
        result = SelectCandidates(source, result, unitIsPrefix);
    }

    return result;
}