/// <summary>
/// Tries every pattern in <c>Patterns</c> against the beginning of <paramref name="tokens"/> and reports the longest match.
/// </summary>
/// <param name="tokens">Tokens to test; matching always starts at index 0.</param>
/// <param name="consumedTokens">Number of tokens covered by the longest matching pattern, or 0 when nothing matched.</param>
/// <returns><c>true</c> when at least one pattern consumed one or more tokens; otherwise <c>false</c>.</returns>
public bool IsMatch(Span <Token> tokens, out int consumedTokens)
        {
            int best = -1;

            for (int p = 0; p < Patterns.Count; p++)
            {
                var  sequence        = Patterns[p];
                int  position        = 0;
                bool mandatoryMissed = false;

                for (int u = 0; u < sequence.Length; u++)
                {
                    PatternUnit unit   = sequence[u];
                    int         cursor = position;

                    //Greedily advance over matching tokens; a Single unit stops after its first hit
                    while (cursor < tokens.Length && unit.IsMatch(ref tokens[cursor]))
                    {
                        cursor++;
                        if (unit.Mode == PatternMatchingMode.Single)
                        {
                            break;
                        }
                    }

                    if (cursor > position)
                    {
                        //The unit consumed at least one token
                        position = cursor;
                        continue;
                    }

                    if (!unit.Optional)
                    {
                        //A mandatory unit found nothing, so this whole pattern is discarded
                        mandatoryMissed = true;
                        break;
                    }
                }

                if (!mandatoryMissed && position > best)
                {
                    best = position;
                }
            }

            bool matched = best > 0;
            consumedTokens = matched ? best : 0;
            return(matched);
        }
        /// <summary>
        /// Scans a document for abbreviation candidates: a single token enclosed by the three-token
        /// <c>CapturePattern</c> (the enclosing tokens are an opening and a closing parenthesis according to the
        /// original comments), whose description is then searched for in the tokens that precede it.
        /// </summary>
        /// <param name="doc">Document to scan. It is skipped unless its language equals <c>Language</c> or is <c>Language.Any</c>.</param>
        /// <param name="shouldSkip">Predicate called for every candidate built; candidates for which it returns <c>true</c> are not added.</param>
        /// <returns>The accepted candidates in discovery order; an empty list when the language does not match or nothing was found.</returns>
        public List <AbbreviationCandidate> ParseDocument(Document doc, Func <AbbreviationCandidate, bool> shouldSkip)
        {
            var found = new List <AbbreviationCandidate>();

            if (doc.Language != Language && doc.Language != Language.Any)
            {
                return(found);
            }

            foreach (var span in doc)
            {
                var tokens = span.ToTokenSpan();
                int N      = tokens.Length - 2; //Only three-token matches are accepted below, so the last two positions cannot start one

                for (int i = 0; i < N; i++)
                {
                    if (CapturePattern.IsMatch(tokens.Slice(i), out var consumedTokens) && consumedTokens == 3)
                    {
                        var innerToken = tokens.Slice(i + 1, consumedTokens - 2)[0]; //Skips opening and closing parenthesis

                        bool shouldDiscard = false;
                        shouldDiscard |= DiscardOnlyLowerCase.IsMatch(ref innerToken); //All lower case
                        shouldDiscard |= DiscardCommonWords.IsMatch(ref innerToken);
                        shouldDiscard |= DiscardIsSymbol.IsMatch(ref innerToken);

                        if (!shouldDiscard)
                        {
                            //Backtrack on the previous tokens to see if we find the explanation of the abbreviation

                            var lettersToMatch = innerToken.ValueAsSpan.ToArray().Where(c => char.IsUpper(c)).ToArray();

                            //Needs enough upper-case letters, and they must make up more than half of the token
                            if (lettersToMatch.Length >= MinimumAbbreviationbLength && lettersToMatch.Length > (0.5 * innerToken.Length))
                            {
                                var matchedLetters = new bool[lettersToMatch.Length];

                                int maxTokensToTry = MaximumTokensToTestForDescriptionPerLetter * lettersToMatch.Length;

                                //Lowest token index (inclusive) that may start the description.
                                //The bound is inclusive so that, when clamped to 0, the first token of the span is still tested;
                                //a description that starts the span would otherwise never be found.
                                int min = i - maxTokensToTry;
                                if (min < 0)
                                {
                                    min = 0;
                                }

                                for (int j = i - 1; j >= min; j--)
                                {
                                    var cur = tokens[j].ValueAsSpan;

                                    //Never extend the description across another parenthesis
                                    if (cur.IndexOfAny(Parenthesis) >= 0)
                                    {
                                        break;
                                    }

                                    //Try to consume tokens: flag every abbreviation letter that occurs in this token
                                    for (int k = 0; k < lettersToMatch.Length; k++)
                                    {
                                        if (cur.IndexOf(lettersToMatch[k]) >= 0)
                                        {
                                            matchedLetters[k] = true;
                                        }
                                    }

                                    if (matchedLetters.All(b => b))
                                    {
                                        //Found all letters, so hopefully we have a match
                                        //Make sure now that the letters appear in sequence
                                        var fullSpan = doc.Value.AsSpan().Slice(tokens[j].Begin, tokens[i - 1].End - tokens[j].Begin + 1);

                                        //Give up when AppearsIn reports the abbreviation inside a description that is not all upper-case
                                        if (AppearsIn(innerToken.ValueAsSpan, fullSpan) && !fullSpan.IsAllUpperCase())
                                        {
                                            break;
                                        }

                                        if (IsSubSequenceOf(lettersToMatch.AsSpan(), fullSpan))
                                        {
                                            //Consume the description's upper-case letters, front to back, against the abbreviation's letters
                                            //NOTE(review): allUpperAbb is a set, so a repeated abbreviation letter (e.g. "WWW") can only absorb
                                            //one matching upper-case letter of the description - confirm this is intended
                                            var allUpper    = fullSpan.ToArray().Where(c => char.IsUpper(c)).ToList();
                                            var allUpperAbb = new HashSet <char>(lettersToMatch);
                                            while (allUpper.Count > 0 && allUpperAbb.Count > 0)
                                            {
                                                var c = allUpper[0];
                                                if (allUpperAbb.Remove(c))
                                                {
                                                    allUpper.RemoveAt(0);
                                                }
                                                else
                                                {
                                                    break;
                                                }
                                            }

                                            //Only add this as an abbreviation if the abbreviation contains all candidate description upper-case letters
                                            if (allUpper.Count == 0)
                                            {
                                                var context = GetContextForCandidate(doc, innerToken);

                                                var candidate = new AbbreviationCandidate
                                                {
                                                    Abbreviation = innerToken.Value,
                                                    Description  = GetStandardForm(fullSpan),
                                                    Context      = context
                                                };

                                                if (!shouldSkip(candidate))
                                                {
                                                    found.Add(candidate);
                                                }
                                            }

                                            //The first span containing the letters in sequence settles this abbreviation, accepted or not
                                            break;
                                        }
                                    }
                                }
                            }
                        }

                        i += consumedTokens - 1; //-1 as we'll do an i++ immediately after
                    }
                }
            }
            return(found);
        }
Beispiel #3
0
        /// <summary>
        /// Tests a single token against this pattern unit.
        /// </summary>
        /// <param name="token">Token to test; it is forwarded by reference to the individual matchers.</param>
        /// <returns>
        /// <c>false</c> for a token whose <c>Length</c> is below 1. In <c>And</c> / <c>Or</c> mode, the conjunction /
        /// disjunction of <c>LeftSide</c> and <c>RightSide</c>. Otherwise whether every check selected by the
        /// <c>Type</c> flags passes, with the result inverted when the mode is <c>ShouldNotMatch</c>.
        /// </returns>
        public bool IsMatch(ref Token token)
        {
            //Empty tokens never match
            if (token.Length < 1)
            {
                return(false);
            }

            //Composite units simply delegate to their two operands
            if (Mode == PatternMatchingMode.And)
            {
                return(LeftSide.IsMatch(ref token) && RightSide.IsMatch(ref token));
            }

            if (Mode == PatternMatchingMode.Or)
            {
                return(LeftSide.IsMatch(ref token) || RightSide.IsMatch(ref token));
            }

            //Each line reads "this flag is not requested, or its matcher accepts the token".
            //The && chain short-circuits, so matchers run in this order and stop at the first failure.
            //The Script and IsLatin checks are currently disabled and therefore not part of the chain.
            bool allChecksPass =
                ((Type & PatternUnitType.Length) != PatternUnitType.Length || MatchLength(ref token)) &&
                ((Type & PatternUnitType.Token) != PatternUnitType.Token || MatchToken(ref token)) &&
                ((Type & PatternUnitType.Shape) != PatternUnitType.Shape || MatchShape(ref token)) &&
                ((Type & PatternUnitType.WithChars) != PatternUnitType.WithChars || MatchWithChars(ref token)) &&
                ((Type & PatternUnitType.POS) != PatternUnitType.POS || MatchPOS(ref token)) &&
                ((Type & PatternUnitType.MultiplePOS) != PatternUnitType.MultiplePOS || MatchMultiplePOS(ref token)) &&
                ((Type & PatternUnitType.Suffix) != PatternUnitType.Suffix || MatchSuffix(ref token)) &&
                ((Type & PatternUnitType.Prefix) != PatternUnitType.Prefix || MatchPrefix(ref token)) &&
                ((Type & PatternUnitType.Set) != PatternUnitType.Set || MatchSet(ref token)) &&
                ((Type & PatternUnitType.Entity) != PatternUnitType.Entity || MatchEntity(ref token)) &&
                ((Type & PatternUnitType.NotEntity) != PatternUnitType.NotEntity || !MatchEntity(ref token)) &&
                ((Type & PatternUnitType.IsDigit) != PatternUnitType.IsDigit || MatchIsDigit(ref token)) &&
                ((Type & PatternUnitType.IsNumeric) != PatternUnitType.IsNumeric || MatchIsNumeric(ref token)) &&
                ((Type & PatternUnitType.HasNumeric) != PatternUnitType.HasNumeric || MatchHasNumeric(ref token)) &&
                ((Type & PatternUnitType.IsAlpha) != PatternUnitType.IsAlpha || MatchIsAlpha(ref token)) &&
                ((Type & PatternUnitType.IsLetterOrDigit) != PatternUnitType.IsLetterOrDigit || MatchIsLetterOrDigit(ref token)) &&
                ((Type & PatternUnitType.IsEmoji) != PatternUnitType.IsEmoji || MatchIsEmoji(ref token)) &&
                ((Type & PatternUnitType.IsPunctuation) != PatternUnitType.IsPunctuation || MatchIsPunctuation(ref token)) &&
                ((Type & PatternUnitType.IsLowerCase) != PatternUnitType.IsLowerCase || MatchIsLowerCase(ref token)) &&
                ((Type & PatternUnitType.IsUpperCase) != PatternUnitType.IsUpperCase || MatchIsUpperCase(ref token)) &&
                ((Type & PatternUnitType.IsTitleCase) != PatternUnitType.IsTitleCase || MatchIsTitleCase(ref token)) &&
                ((Type & PatternUnitType.LikeURL) != PatternUnitType.LikeURL || MatchLikeURL(ref token)) &&
                ((Type & PatternUnitType.LikeEmail) != PatternUnitType.LikeEmail || MatchLikeEmail(ref token)) &&
                ((Type & PatternUnitType.IsOpeningParenthesis) != PatternUnitType.IsOpeningParenthesis || MatchIsOpeningParenthesis(ref token)) &&
                ((Type & PatternUnitType.IsClosingParenthesis) != PatternUnitType.IsClosingParenthesis || MatchIsClosingParenthesis(ref token));

            //ShouldNotMatch flips the outcome of the checks above
            bool invert = Mode == PatternMatchingMode.ShouldNotMatch;
            return(allChecksPass != invert);
        }