Beispiel #1
0
        /// <summary>
        /// Extracts every span of <paramref name="text"/> that is covered by exactly one match
        /// of the regexes in <c>Regexes</c>.
        /// </summary>
        /// <remarks>
        /// Every character covered by any regex match is flagged. Each maximal run of flagged
        /// characters is then reported only if a single match starts at the run's first
        /// character and has the run's length. Runs that start or end with
        /// <c>Constants.IPV6_ELLIPSIS</c> and touch a non-CJK letter or digit on that side are
        /// dropped.
        /// </remarks>
        /// <param name="text">The text to scan; null or empty yields an empty list.</param>
        /// <returns>
        /// The extracted spans in order of position. <c>Data</c> holds the value that
        /// <c>Regexes</c> associates with the regex that produced the match.
        /// </returns>
        public override List <ExtractResult> Extract(string text)
        {
            var result = new List <ExtractResult>();

            if (string.IsNullOrEmpty(text))
            {
                return(result);
            }

            var matchSource = new Dictionary <Match, string>();
            var matched     = new bool[text.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(text), p => p.Value);

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    // Flag every character this match covers.
                    for (var j = 0; j < m.Length; j++)
                    {
                        matched[m.Index + j] = true;
                    }

                    // Keep Source Data for extra information
                    matchSource.Add(m, collection.Value);
                }
            }

            var lastNotMatched = -1;

            for (var i = 0; i < text.Length; i++)
            {
                if (!matched[i])
                {
                    lastNotMatched = i;
                    continue;
                }

                // Only act on the last character of a run of flagged characters.
                if (i + 1 < text.Length && matched[i + 1])
                {
                    continue;
                }

                var start  = lastNotMatched + 1;
                var length = i - lastNotMatched;
                var substr = text.Substring(start, length);

                // Reject a run that begins with the ellipsis and is glued to a preceding
                // non-CJK letter or digit.
                if (substr.StartsWith(Constants.IPV6_ELLIPSIS) &&
                    (start > 0 && char.IsLetterOrDigit(text[start - 1]) && !SimpleTokenizer.IsCjk(text[start - 1])))
                {
                    continue;
                }

                // Reject a run that ends with the ellipsis and is glued to a following
                // non-CJK letter or digit. The CJK test must look at the same following
                // character (text[i + 1]) as the letter-or-digit test.
                if (substr.EndsWith(Constants.IPV6_ELLIPSIS) &&
                    (i + 1 < text.Length && char.IsLetterOrDigit(text[i + 1]) && !SimpleTokenizer.IsCjk(text[i + 1])))
                {
                    continue;
                }

                // A run is reported only when one single match spans it exactly; runs formed
                // by several overlapping or adjacent matches are skipped.
                var srcMatch = matchSource.Keys.FirstOrDefault(o => o.Index == start && o.Length == length);

                if (srcMatch != null)
                {
                    result.Add(new ExtractResult
                    {
                        Start  = start,
                        Length = length,
                        Text   = substr,
                        Type   = ExtractType,
                        Data   = matchSource[srcMatch],
                    });
                }
            }

            return(result);
        }
        /// <summary>
        /// Appends to <paramref name="ers"/> the numbers found by <c>config.UnitNumExtractor</c>
        /// that follow an existing extraction and are separated from it only by whitespace or
        /// by text fully matched by <c>config.CompoundUnitConnectorRegex</c>, then sorts
        /// <paramref name="ers"/> by <c>Start</c>.
        /// </summary>
        /// <param name="source">The text the extractions refer to.</param>
        /// <param name="ers">The existing extractions; modified in place.</param>
        private void MergePureNumber(string source, List <ExtractResult> ers)
        {
            var numbers    = config.UnitNumExtractor.Extract(source);
            var candidates = new List <ExtractResult>();

            // Cursor into ers; it only moves forward across all numbers.
            var cursor = 0;

            foreach (var number in numbers)
            {
                // Step past every extraction that ends before this number begins. The number
                // is considered only if at least one extraction was stepped past right now.
                var steppedPast = false;
                while (cursor < ers.Count && ers[cursor].Start + ers[cursor].Length < number.Start)
                {
                    steppedPast = true;
                    cursor++;
                }

                if (!steppedPast)
                {
                    continue;
                }

                // Skip numbers immediately followed by a non-CJK letter, e.g. in
                // "1 dollars 11a" the "11" is not a fraction of the amount.
                var numberEnd = number.Length + number.Start;
                if (numberEnd < source.Length)
                {
                    var following = source.Substring(numberEnd ?? 0, 1)[0];
                    if (char.IsLetter(following) && !SimpleTokenizer.IsCjk(following))
                    {
                        continue;
                    }
                }

                // Text between the end of the preceding extraction and the number.
                var previous = ers[cursor - 1];
                var gapStart = (previous.Start + previous.Length) ?? 0;
                var gapEnd   = number.Start ?? 0;
                var gap      = source.Substring(gapStart, gapEnd - gapStart).Trim();

                // Accept when separated by whitespace only, or by a connector that
                // accounts for the whole gap.
                var accepted = string.IsNullOrEmpty(gap);
                if (!accepted)
                {
                    var connector = config.CompoundUnitConnectorRegex.Match(gap);
                    accepted = connector.Success && connector.Index == 0 && connector.Length == gap.Length;
                }

                if (accepted)
                {
                    candidates.Add(number);
                }
            }

            foreach (var candidate in candidates)
            {
                // A candidate is dropped if its start lies inside (or at the end of) any
                // extraction already in ers, including candidates added earlier in this loop.
                var overlaps = false;
                for (var k = 0; k < ers.Count && !overlaps; k++)
                {
                    var existing = ers[k];
                    overlaps = existing.Start <= candidate.Start &&
                               existing.Start + existing.Length >= candidate.Start;
                }

                if (!overlaps)
                {
                    ers.Add(candidate);
                }
            }

            ers.Sort((left, right) => (left.Start - right.Start) ?? 0);
        }