/// <summary>
/// Sums, over every distinct token in the merged set, the absolute difference
/// between how often that token occurs in each token list — i.e. the block
/// (L1) distance between the two token-frequency vectors.
/// </summary>
/// <param name="firstTokens">tokens of the first string</param>
/// <param name="secondTokens">tokens of the second string</param>
/// <returns>total occurrence-count difference; 0 when both lists hold the same multiset of tokens</returns>
private double GetActualSimilarity(Collection<string> firstTokens, Collection<string> secondTokens)
{
    // CreateMergedSet also refreshes _tokenUtilities' internal counters as a
    // side effect, so the call must be kept even though the per-token counts
    // below are rebuilt locally.
    Collection<string> mergedTokens = _tokenUtilities.CreateMergedSet(firstTokens, secondTokens);

    // Build each occurrence map once (O(n + m)) instead of rescanning both
    // lists for every distinct token (O(n * m)). Dictionary's default string
    // comparer is ordinal, matching the original string.Equals calls.
    Dictionary<string, int> firstCounts = new Dictionary<string, int>();
    foreach (string token in firstTokens)
    {
        int current;
        firstCounts.TryGetValue(token, out current);
        firstCounts[token] = current + 1;
    }

    Dictionary<string, int> secondCounts = new Dictionary<string, int>();
    foreach (string token in secondTokens)
    {
        int current;
        secondCounts.TryGetValue(token, out current);
        secondCounts[token] = current + 1;
    }

    int totalDifference = 0;
    foreach (string token in mergedTokens)
    {
        int firstCount;
        int secondCount;
        firstCounts.TryGetValue(token, out firstCount);   // leaves 0 when absent
        secondCounts.TryGetValue(token, out secondCount); // leaves 0 when absent
        totalDifference += Math.Abs(firstCount - secondCount);
    }

    return totalDifference;
}
/// <summary>
/// Gets the cosine similarity of the two words' token sets:
/// common terms / (sqrt(|first set|) * sqrt(|second set|)).
/// </summary>
/// <param name="firstWord">first word to compare</param>
/// <param name="secondWord">second word to compare</param>
/// <returns>a similarity score between 0 and 1; 0.0 when either word is null or no tokens are produced</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return 0.0;
    }

    // CreateMergedSet populates _tokenUtilities' set counts for this pair.
    Collection<string> mergedTokens = _tokenUtilities.CreateMergedSet(_tokeniser.Tokenize(firstWord), _tokeniser.Tokenize(secondWord));
    if (mergedTokens.Count == 0)
    {
        return 0.0;
    }

    double denominator = Math.Pow(_tokenUtilities.FirstSetTokenCount, 0.5) * Math.Pow(_tokenUtilities.SecondSetTokenCount, 0.5);
    return _tokenUtilities.CommonSetTerms() / denominator;
}
/// <summary>
/// Gets the overlap-coefficient similarity of the two words:
/// common terms / min(|first set|, |second set|).
/// </summary>
/// <param name="firstWord">first word to compare</param>
/// <param name="secondWord">second word to compare</param>
/// <returns>a similarity score between 0 and 1; DefaultMismatchScore when either word is null</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return DefaultMismatchScore;
    }

    // CreateMergedSet populates _tokenUtilities' counts for this pair;
    // the merged set itself is not needed here.
    _tokenUtilities.CreateMergedSet(_tokeniser.Tokenize(firstWord), _tokeniser.Tokenize(secondWord));
    double smallerSetSize = Math.Min(_tokenUtilities.FirstSetTokenCount, _tokenUtilities.SecondSetTokenCount);
    return _tokenUtilities.CommonSetTerms() / smallerSetSize;
}
/// <summary>
/// Gets the Jaccard similarity of the two words:
/// common terms / |union of both token sets|.
/// </summary>
/// <param name="firstWord">first word to compare</param>
/// <param name="secondWord">second word to compare</param>
/// <returns>a similarity score between 0 and 1; DefaultMismatchScore when either word is null or the union is empty</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return DefaultMismatchScore;
    }

    Collection<string> unionTokens = _tokenUtilities.CreateMergedSet(_tokeniser.Tokenize(firstWord), _tokeniser.Tokenize(secondWord));
    return unionTokens.Count > 0
        ? _tokenUtilities.CommonSetTerms() / (double)unionTokens.Count
        : DefaultMismatchScore;
}
/// <summary>
/// gets the similarity of the two strings using OverlapCoefficient
/// </summary>
/// <param name="firstWord">first word to compare</param>
/// <param name="secondWord">second word to compare</param>
/// <returns>a value between 0-1 of the similarity</returns>
/// <remarks>overlap_coefficient(q,r) = ( | q and r | ) / min{ | q | , | r | }.</remarks>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if ((firstWord != null) && (secondWord != null))
    {
        // FIX: this call had been commented out, so CommonSetTerms(),
        // FirstSetTokenCount and SecondSetTokenCount still reflected whatever
        // word pair tokenUtilities processed last — stale, wrong results.
        // CreateMergedSet must run first to populate the counts for THIS pair.
        tokenUtilities.CreateMergedSet(tokeniser.Tokenize(firstWord), tokeniser.Tokenize(secondWord));
        return tokenUtilities.CommonSetTerms() /
               (double)Math.Min(tokenUtilities.FirstSetTokenCount, tokenUtilities.SecondSetTokenCount);
    }
    return defaultMismatchScore;
}
/// <summary>
/// gets the similarity of the two strings using DiceSimilarity
/// </summary>
/// <param name="firstWord">first word to compare</param>
/// <param name="secondWord">second word to compare</param>
/// <returns>a value between 0-1 of the similarity</returns>
/// <remarks>Dices coefficient = (2*Common Terms) / (Number of terms in String1 + Number of terms in String2).</remarks>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return 0.0;
    }

    // CreateMergedSet populates tokenUtilities' counts for this word pair.
    Collection<string> mergedTokens = tokenUtilities.CreateMergedSet(tokeniser.Tokenize(firstWord), tokeniser.Tokenize(secondWord));
    if (mergedTokens.Count == 0)
    {
        return 0.0;
    }

    int totalTokenCount = tokenUtilities.FirstSetTokenCount + tokenUtilities.SecondSetTokenCount;
    return (2.0 * tokenUtilities.CommonSetTerms()) / totalTokenCount;
}
/// <summary>
/// gets the similarity of the two strings using JaccardSimilarity.
/// </summary>
/// <param name="firstWord">first word to compare</param>
/// <param name="secondWord">second word to compare</param>
/// <returns>a value between 0-1 of the similarity</returns>
/// <remarks>Each instance is represented as a Jaccard vector similarity function. The Jaccard between two vectors X and Y is
/// (X*Y) / (|X||Y|-(X*Y))
/// where (X*Y) is the inner product of X and Y, and |X| = (X*X)^1/2, i.e. the Euclidean norm of X.
/// This can more easily be described as ( |X and Y| ) / ( | X or Y | )</remarks>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return defaultMismatchScore;
    }

    Collection<string> unionTokens = tokenUtilities.CreateMergedSet(tokeniser.Tokenize(firstWord), tokeniser.Tokenize(secondWord));
    if (unionTokens.Count == 0)
    {
        return defaultMismatchScore;
    }

    return tokenUtilities.CommonSetTerms() / (double)unionTokens.Count;
}
/// <summary>
/// Sums, over every distinct token in the merged set, the absolute difference
/// between how often that token occurs in each token list — the block (L1)
/// distance between the two token-frequency vectors.
/// </summary>
/// <param name="firstTokens">tokens of the first string</param>
/// <param name="secondTokens">tokens of the second string</param>
/// <returns>total occurrence-count difference; 0 when both lists hold the same multiset of tokens</returns>
double GetActualSimilarity(Collection<string> firstTokens, Collection<string> secondTokens)
{
    // CreateMergedSet also refreshes tokenUtilities' internal counters as a
    // side effect, so the call must stay even though the per-token counts
    // below are built locally.
    Collection<string> allTokens = tokenUtilities.CreateMergedSet(firstTokens, secondTokens);

    // Build each occurrence map once (O(n + m)) instead of rescanning both
    // lists for every distinct token (O(n * m)). Dictionary's default string
    // comparer is ordinal, matching the original string.Equals calls.
    Dictionary<string, int> firstCounts = new Dictionary<string, int>();
    foreach (string token in firstTokens)
    {
        int current;
        firstCounts.TryGetValue(token, out current);
        firstCounts[token] = current + 1;
    }

    Dictionary<string, int> secondCounts = new Dictionary<string, int>();
    foreach (string token in secondTokens)
    {
        int current;
        secondCounts.TryGetValue(token, out current);
        secondCounts[token] = current + 1;
    }

    int difference = 0;
    foreach (string token in allTokens)
    {
        int matchingQGrams1;
        int matchingQGrams2;
        firstCounts.TryGetValue(token, out matchingQGrams1);   // leaves 0 when absent
        secondCounts.TryGetValue(token, out matchingQGrams2);  // leaves 0 when absent
        difference += Math.Abs(matchingQGrams1 - matchingQGrams2);
    }

    return difference;
}