/// <summary>
/// Tries every pattern in <c>Patterns</c> against the beginning of <paramref name="tokens"/> and
/// reports the longest run of tokens consumed by any of them.
/// </summary>
/// <param name="tokens">Tokens to test; matching always starts at index 0.</param>
/// <param name="consumedTokens">Number of tokens consumed by the longest matching pattern, or 0 when nothing matched.</param>
/// <returns><c>true</c> when at least one pattern consumed one or more tokens; otherwise <c>false</c>.</returns>
public bool IsMatch(Span<Token> tokens, out int consumedTokens)
{
    int best = -1;

    for (int p = 0; p < Patterns.Count; p++)
    {
        // Index of the next token this pattern still has to consume.
        int position = 0;

        for (int u = 0; u < Patterns[p].Length; u++)
        {
            PatternUnit unit = Patterns[p][u];
            int cursor = position;

            // Consume consecutive matching tokens; a Single-mode unit stops after the first one.
            while (cursor < tokens.Length && unit.IsMatch(ref tokens[cursor]))
            {
                cursor++;

                if (unit.Mode == PatternMatchingMode.Single)
                {
                    break;
                }
            }

            if (cursor > position)
            {
                // The unit consumed at least one token.
                position = cursor;
            }
            else if (!unit.Optional)
            {
                // Didn't match a mandatory token, so abort this pattern.
                // int.MinValue guarantees it can never become the best match.
                position = int.MinValue;
                break;
            }
        }

        if (position > best)
        {
            best = position;
        }
    }

    bool matched = best > 0;
    consumedTokens = matched ? best : 0;
    return matched;
}
/// <summary>
/// Scans <paramref name="doc"/> for abbreviation candidates: a three-token match of <c>CapturePattern</c>
/// (opening parenthesis, one inner token, closing parenthesis) whose inner token is then paired with a
/// description built from the tokens immediately preceding it.
/// </summary>
/// <param name="doc">
/// Document to scan. An empty list is returned unless its language equals this instance's
/// <c>Language</c> or is <c>Language.Any</c>.
/// </param>
/// <param name="shouldSkip">Filter invoked on every candidate; candidates for which it returns <c>true</c> are not added.</param>
/// <returns>The accepted candidates in the order they were found; never <c>null</c>.</returns>
public List<AbbreviationCandidate> ParseDocument(Document doc, Func<AbbreviationCandidate, bool> shouldSkip)
{
    var found = new List<AbbreviationCandidate>();

    if (doc.Language != Language && doc.Language != Language.Any)
    {
        return found;
    }

    foreach (var span in doc)
    {
        var tokens = span.ToTokenSpan();
        int N = tokens.Length - 2; // A capture needs three tokens, so the last two start positions cannot match

        for (int i = 0; i < N; i++)
        {
            if (CapturePattern.IsMatch(tokens.Slice(i), out var consumedTokens) && consumedTokens == 3)
            {
                // Skip the opening parenthesis; the closing one follows the inner token.
                var innerToken = tokens[i + 1];

                // All three filters are evaluated unconditionally (non-short-circuit), as before.
                bool shouldDiscard = false;
                shouldDiscard |= DiscardOnlyLowerCase.IsMatch(ref innerToken); // All lower case
                shouldDiscard |= DiscardCommonWords.IsMatch(ref innerToken);
                shouldDiscard |= DiscardIsSymbol.IsMatch(ref innerToken);

                if (!shouldDiscard)
                {
                    // Backtrack over the previous tokens to see if we find the explanation of the abbreviation.
                    var lettersToMatch = innerToken.ValueAsSpan.ToArray().Where(c => char.IsUpper(c)).ToArray();

                    // Accept abbreviations with up to 50% lower-case letters, as long as they have enough upper-case letters.
                    if (lettersToMatch.Length >= MinimumAbbreviationbLength && lettersToMatch.Length > (0.5 * innerToken.Length))
                    {
                        var matchedLetters = new bool[lettersToMatch.Length];
                        int maxTokensToTry = MaximumTokensToTestForDescriptionPerLetter * lettersToMatch.Length;

                        // 'min' is an EXCLUSIVE lower bound for j. When the window reaches the start of the span
                        // it must be clamped to -1 (not 0), otherwise the first token of the span is never examined
                        // and descriptions that begin the span are missed.
                        int min = i - 1 - maxTokensToTry;
                        if (min < -1)
                        {
                            min = -1;
                        }

                        for (int j = i - 1; j > min; j--)
                        {
                            var cur = tokens[j].ValueAsSpan;

                            // Never extend a description across another parenthesis.
                            if (cur.IndexOfAny(Parenthesis) >= 0)
                            {
                                break;
                            }

                            // Try to consume tokens: mark every abbreviation letter that occurs in this token.
                            for (int k = 0; k < lettersToMatch.Length; k++)
                            {
                                if (cur.IndexOf(lettersToMatch[k]) >= 0)
                                {
                                    matchedLetters[k] = true;
                                }
                            }

                            if (!matchedLetters.All(b => b))
                            {
                                continue; // Not every letter seen yet; widen the window by one more token
                            }

                            // Found all letters, so hopefully we have a match.
                            // Make sure now that the letters appear in sequence.
                            var fullSpan = doc.Value.AsSpan().Slice(tokens[j].Begin, tokens[i - 1].End - tokens[j].Begin + 1);

                            if (AppearsIn(innerToken.ValueAsSpan, fullSpan) && !fullSpan.IsAllUpperCase())
                            {
                                break;
                            }

                            if (!IsSubSequenceOf(lettersToMatch.AsSpan(), fullSpan))
                            {
                                continue;
                            }

                            // Only add this as an abbreviation if the abbreviation contains all candidate
                            // description upper-case letters. The abbreviation letters are kept as a multiset
                            // (a list, not a set) so that repeated letters such as "AAA" can each cover one
                            // upper-case letter of the description.
                            var unclaimedLetters = new List<char>(lettersToMatch);
                            bool coversAllUpperCase = true;

                            foreach (char ch in fullSpan)
                            {
                                if (char.IsUpper(ch) && !unclaimedLetters.Remove(ch))
                                {
                                    coversAllUpperCase = false;
                                    break;
                                }
                            }

                            if (coversAllUpperCase)
                            {
                                var context = GetContextForCandidate(doc, innerToken);
                                var candidate = new AbbreviationCandidate
                                {
                                    Abbreviation = innerToken.Value,
                                    Description = GetStandardForm(fullSpan),
                                    Context = context
                                };

                                if (!shouldSkip(candidate))
                                {
                                    found.Add(candidate);
                                }
                            }

                            // The letters were found in sequence: stop backtracking whether or not the candidate was kept.
                            break;
                        }
                    }
                }

                i += consumedTokens - 1; // -1 as we'll do an i++ immediately after
            }
        }
    }

    return found;
}
/// <summary>
/// Evaluates this pattern unit against a single token.
/// </summary>
/// <remarks>
/// In <c>And</c>/<c>Or</c> mode the result is the combination of <c>LeftSide</c> and <c>RightSide</c>.
/// Otherwise every check whose flag is set in <c>Type</c> must pass, evaluated in a fixed order and
/// stopping at the first failure; <c>ShouldNotMatch</c> mode inverts that result.
/// </remarks>
/// <param name="token">Token to test.</param>
/// <returns><c>true</c> when the token satisfies this unit; empty tokens always yield <c>false</c>.</returns>
public bool IsMatch(ref Token token)
{
    // Empty tokens never match
    if (token.Length < 1)
    {
        return false;
    }

    // Composite units delegate entirely to their two operands.
    if (Mode == PatternMatchingMode.And)
    {
        return LeftSide.IsMatch(ref token) && RightSide.IsMatch(ref token);
    }

    if (Mode == PatternMatchingMode.Or)
    {
        return LeftSide.IsMatch(ref token) || RightSide.IsMatch(ref token);
    }

    var flags = Type;

    // Each line reads "flag not requested, or its check passes". The && chain short-circuits,
    // so no further check runs once one has failed.
    // The Script and IsLatin checks are disabled and therefore not part of the chain.
    bool satisfied =
           ((flags & PatternUnitType.Length) != PatternUnitType.Length || MatchLength(ref token))
        && ((flags & PatternUnitType.Token) != PatternUnitType.Token || MatchToken(ref token))
        && ((flags & PatternUnitType.Shape) != PatternUnitType.Shape || MatchShape(ref token))
        && ((flags & PatternUnitType.WithChars) != PatternUnitType.WithChars || MatchWithChars(ref token))
        && ((flags & PatternUnitType.POS) != PatternUnitType.POS || MatchPOS(ref token))
        && ((flags & PatternUnitType.MultiplePOS) != PatternUnitType.MultiplePOS || MatchMultiplePOS(ref token))
        && ((flags & PatternUnitType.Suffix) != PatternUnitType.Suffix || MatchSuffix(ref token))
        && ((flags & PatternUnitType.Prefix) != PatternUnitType.Prefix || MatchPrefix(ref token))
        && ((flags & PatternUnitType.Set) != PatternUnitType.Set || MatchSet(ref token))
        && ((flags & PatternUnitType.Entity) != PatternUnitType.Entity || MatchEntity(ref token))
        && ((flags & PatternUnitType.NotEntity) != PatternUnitType.NotEntity || !MatchEntity(ref token))
        && ((flags & PatternUnitType.IsDigit) != PatternUnitType.IsDigit || MatchIsDigit(ref token))
        && ((flags & PatternUnitType.IsNumeric) != PatternUnitType.IsNumeric || MatchIsNumeric(ref token))
        && ((flags & PatternUnitType.HasNumeric) != PatternUnitType.HasNumeric || MatchHasNumeric(ref token))
        && ((flags & PatternUnitType.IsAlpha) != PatternUnitType.IsAlpha || MatchIsAlpha(ref token))
        && ((flags & PatternUnitType.IsLetterOrDigit) != PatternUnitType.IsLetterOrDigit || MatchIsLetterOrDigit(ref token))
        && ((flags & PatternUnitType.IsEmoji) != PatternUnitType.IsEmoji || MatchIsEmoji(ref token))
        && ((flags & PatternUnitType.IsPunctuation) != PatternUnitType.IsPunctuation || MatchIsPunctuation(ref token))
        && ((flags & PatternUnitType.IsLowerCase) != PatternUnitType.IsLowerCase || MatchIsLowerCase(ref token))
        && ((flags & PatternUnitType.IsUpperCase) != PatternUnitType.IsUpperCase || MatchIsUpperCase(ref token))
        && ((flags & PatternUnitType.IsTitleCase) != PatternUnitType.IsTitleCase || MatchIsTitleCase(ref token))
        && ((flags & PatternUnitType.LikeURL) != PatternUnitType.LikeURL || MatchLikeURL(ref token))
        && ((flags & PatternUnitType.LikeEmail) != PatternUnitType.LikeEmail || MatchLikeEmail(ref token))
        && ((flags & PatternUnitType.IsOpeningParenthesis) != PatternUnitType.IsOpeningParenthesis || MatchIsOpeningParenthesis(ref token))
        && ((flags & PatternUnitType.IsClosingParenthesis) != PatternUnitType.IsClosingParenthesis || MatchIsClosingParenthesis(ref token));

    return Mode == PatternMatchingMode.ShouldNotMatch ? !satisfied : satisfied;
}