/// <summary>
/// Extracts entities from <paramref name="text"/> by running every regex in <c>Regexes</c> over it.
/// </summary>
/// <remarks>
/// Every character covered by any match is flagged. Each maximal run of flagged characters is then
/// reported only if it coincides exactly (same index and length) with one individual regex match;
/// the value associated with that regex in <c>Regexes</c> is stored in <c>Data</c>.
/// Runs that start or end with <c>Constants.IPV6_ELLIPSIS</c> are dropped when the character touching
/// that side is a non-CJK letter or digit.
/// </remarks>
/// <param name="text">The text to scan. May be null or empty.</param>
/// <returns>The extracted results in order of position; an empty list for null or empty input.</returns>
public override List<ExtractResult> Extract(string text)
{
    var result = new List<ExtractResult>();

    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    var matchSource = new Dictionary<Match, string>();
    var matched = new bool[text.Length];

    foreach (var regex in Regexes)
    {
        foreach (Match m in regex.Key.Matches(text))
        {
            for (var j = 0; j < m.Length; j++)
            {
                matched[m.Index + j] = true;
            }

            // Keep Source Data for extra information
            matchSource.Add(m, regex.Value);
        }
    }

    var lastNotMatched = -1;
    for (var i = 0; i < text.Length; i++)
    {
        if (!matched[i])
        {
            lastNotMatched = i;
            continue;
        }

        // Still inside a run of matched characters; wait for its last character.
        if (i + 1 < text.Length && matched[i + 1])
        {
            continue;
        }

        var start = lastNotMatched + 1;
        var length = i - lastNotMatched;
        var substr = text.Substring(start, length);

        // A leading ellipsis glued to a preceding non-CJK letter/digit is rejected.
        if (substr.StartsWith(Constants.IPV6_ELLIPSIS) &&
            start > 0 &&
            char.IsLetterOrDigit(text[start - 1]) &&
            !SimpleTokenizer.IsCjk(text[start - 1]))
        {
            continue;
        }

        // A trailing ellipsis glued to a following non-CJK letter/digit is rejected.
        // Both tests inspect the character right after the run (text[i + 1]); the CJK test
        // previously looked at text[start + 1], which is inside the run.
        if (substr.EndsWith(Constants.IPV6_ELLIPSIS) &&
            i + 1 < text.Length &&
            char.IsLetterOrDigit(text[i + 1]) &&
            !SimpleTokenizer.IsCjk(text[i + 1]))
        {
            continue;
        }

        // Only report the run if a single regex match spans exactly the same range.
        var srcMatch = matchSource.Keys.FirstOrDefault(o => o.Index == start && o.Length == length);
        if (srcMatch != null)
        {
            result.Add(new ExtractResult
            {
                Start = start,
                Length = length,
                Text = substr,
                Type = ExtractType,
                Data = matchSource[srcMatch],
            });
        }
    }

    return result;
}
/// <summary>
/// Adds to <paramref name="ers"/> the numbers extracted from <paramref name="source"/> by
/// <c>config.UnitNumExtractor</c> that follow an existing extraction and are separated from it only by
/// whitespace or by text fully matched by <c>config.CompoundUnitConnectorRegex</c>.
/// Numbers overlapping an entry already in <paramref name="ers"/> are not added.
/// The list is sorted by start position before returning.
/// </summary>
/// <param name="source">The text the extractions refer to.</param>
/// <param name="ers">Existing extractions; modified in place.</param>
private void MergePureNumber(string source, List<ExtractResult> ers)
{
    var numberResults = config.UnitNumExtractor.Extract(source);
    var candidates = new List<ExtractResult>();

    // Cursor into ers; it is shared by all numbers and only ever moves forward.
    var cursor = 0;

    foreach (var number in numberResults)
    {
        // Advance past every extraction that ends before this number starts.
        var cursorBefore = cursor;
        while (cursor < ers.Count && ers[cursor].Start + ers[cursor].Length < number.Start)
        {
            cursor++;
        }

        // Nothing was skipped for this number, so it is not considered.
        if (cursor == cursorBefore)
        {
            continue;
        }

        // Filter cases like "1 dollars 11a", "11" is not the fraction here.
        var numberEnd = number.Start + number.Length;
        if (source.Length > numberEnd)
        {
            var following = source.Substring(numberEnd ?? 0, 1)[0];
            if (char.IsLetter(following) && !SimpleTokenizer.IsCjk(following))
            {
                continue;
            }
        }

        // Text between the end of the last skipped extraction and the start of the number.
        var previous = ers[cursor - 1];
        var gapStart = previous.Start + previous.Length ?? 0;
        var gapEnd = number.Start ?? 0;
        var gap = source.Substring(gapStart, gapEnd - gapStart).Trim();

        // Accepted when separated by whitespace only...
        var isJoined = string.IsNullOrEmpty(gap);
        if (!isJoined)
        {
            // ...or when the whole gap is a connector.
            var connector = config.CompoundUnitConnectorRegex.Match(gap);
            isJoined = connector.Success && connector.Index == 0 && connector.Length == gap.Length;
        }

        if (isJoined)
        {
            candidates.Add(number);
        }
    }

    foreach (var candidate in candidates)
    {
        // Skip a candidate whose start falls inside (or at the end boundary of) a current entry.
        var overlapsExisting = ers.Exists(existing =>
            existing.Start <= candidate.Start &&
            existing.Start + existing.Length >= candidate.Start);

        if (!overlapsExisting)
        {
            ers.Add(candidate);
        }
    }

    ers.Sort((left, right) => left.Start - right.Start ?? 0);
}