/// <summary>
/// Appends <paramref name="token"/> to <paramref name="result"/> unless its text is shorter than
/// <paramref name="ngramsLength"/>.
/// </summary>
/// <param name="result">The list that receives the token when it qualifies.</param>
/// <param name="token">The candidate token; its <c>TokenText.Length</c> is compared to the threshold.</param>
/// <param name="ngramsLength">Minimum text length (inclusive) a token must have to be kept.</param>
private static void AddToList(List<TokenMatchInfo> result, TokenMatchInfo token, int ngramsLength)
{
    // Too-short tokens are silently dropped.
    if (token.TokenText.Length < ngramsLength)
    {
        return;
    }

    result.Add(token);
}
/// <summary>
/// Builds the list of candidate tokens from a sequence of single-word tokens: every single word,
/// followed by every run of consecutive words containing between 2 and
/// <paramref name="maximumWordCount"/> words, joined by single spaces.
/// Candidates are passed through <c>AddToList</c>, which drops those whose text is shorter than
/// <paramref name="ngramsLength"/>.
/// </summary>
/// <param name="tokenList">The single-word tokens, in sentence order.</param>
/// <param name="maximumWordCount">Largest number of words a combined token may contain.</param>
/// <param name="ngramsLength">Minimum text length a token must have to be included in the result.</param>
/// <returns>
/// A new list holding the qualifying single-word tokens (the same instances as in
/// <paramref name="tokenList"/>) followed by newly created multi-word tokens.
/// </returns>
public static List<TokenMatchInfo> GetAllPossibleTokens(List<TokenMatchInfo> tokenList, int maximumWordCount, int ngramsLength)
{
    var candidates = new List<TokenMatchInfo>();

    // Pass 1: every single word is a candidate on its own.
    for (int i = 0; i < tokenList.Count; i++)
    {
        AddToList(candidates, tokenList[i], ngramsLength);
    }

    // Pass 2: for each starting word, grow a phrase one word at a time.
    for (int first = 0; first < tokenList.Count; first++)
    {
        var head = tokenList[first];
        var phrase = head.TokenText;
        var phraseStart = head.StartIndex;

        // 'last' is the index of the final word of the phrase. Stop once the phrase would
        // exceed maximumWordCount words or run past the end of the list.
        for (int last = first + 1; last - first < maximumWordCount && last < tokenList.Count; last++)
        {
            var tail = tokenList[last];

            // Extend the running phrase with the next word.
            phrase = phrase + " " + tail.TokenText;

            // The combined token spans from the first word's start to the last word's end.
            var combined = new TokenMatchInfo
            {
                TokenText = phrase,
                StartIndex = phraseStart,
                EndIndex = tail.EndIndex
            };

            AddToList(candidates, combined, ngramsLength);
        }
    }

    return candidates;
}
/// <summary>
/// Determines whether <paramref name="obj"/> carries the same token text, start index and end index
/// as this instance.
/// </summary>
/// <param name="obj">The instance to compare with; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> when <c>TokenText</c>, <c>StartIndex</c> and <c>EndIndex</c> all match;
/// <c>false</c> otherwise, including when <paramref name="obj"/> is <c>null</c>.
/// </returns>
public bool Equals(TokenMatchInfo obj)
{
    // A null argument is never equal to an existing instance. Without this guard the
    // member accesses below would throw NullReferenceException.
    if (obj is null)
    {
        return false;
    }

    return TokenText == obj.TokenText
        && StartIndex == obj.StartIndex
        && EndIndex == obj.EndIndex;
}
/// <summary>
/// Copy constructor: initializes this instance with the <c>TokenText</c>, <c>StartIndex</c> and
/// <c>EndIndex</c> of <paramref name="token"/>.
/// </summary>
/// <param name="token">The instance whose values are copied.</param>
public TokenMatchInfo(TokenMatchInfo token)
{
    // Read the source values first, then assign them to this instance.
    var text = token.TokenText;
    var start = token.StartIndex;
    var end = token.EndIndex;

    this.TokenText = text;
    this.StartIndex = start;
    this.EndIndex = end;
}
public static List <MatchResult> FilterByThreshold(float[] similarityValues, List <string> dataset, float threshold, TokenMatchInfo token = default) { var matchResult = new List <MatchResult>(); for (int i = 0; i < similarityValues.Length; i++) { if (similarityValues[i] >= threshold) { matchResult.Add(new MatchResult { SimilarityScore = similarityValues[i], TokenMatchInfo = token == default ? null : new TokenMatchInfo(token), DatabaseMatchInfo = new DatabaseMatchInfo() { MatchText = dataset[i], MatchIndex = i } });