/// <summary>
/// Gets the similarity of the two strings using Monge Elkan: each token of the first
/// word is matched against its best-scoring token in the second word, and the average
/// of those best scores is returned.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; <c>DefaultMismatchScore</c> when
/// either word is null or the first word yields no tokens</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return DefaultMismatchScore;
    }
    Collection<string> firstTokens = Tokeniser.Tokenize(firstWord);
    Collection<string> secondTokens = Tokeniser.Tokenize(secondWord);
    // Guard: an empty first token list would otherwise divide by zero below and return NaN.
    if (firstTokens.Count == 0)
    {
        return DefaultMismatchScore;
    }
    double sumMatches = 0.0;
    for (int i = 0; i < firstTokens.Count; i++)
    {
        string firstToken = firstTokens[i];
        // Best match for this token across all tokens of the second word.
        double maxFound = 0.0;
        for (int j = 0; j < secondTokens.Count; j++)
        {
            double found = _internalStringMetric.GetSimilarity(firstToken, secondTokens[j]);
            if (found > maxFound)
            {
                maxFound = found;
            }
        }
        sumMatches += maxFound;
    }
    return sumMatches / firstTokens.Count;
}
/// <summary>
/// gets the similarity of the two strings using Monge Elkan.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; <c>defaultMismatchScore</c> when
/// either word is null or the first word yields no tokens</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if ((firstWord != null) && (secondWord != null))
    {
        Collection<string> firstTokens = tokeniser.Tokenize(firstWord);
        Collection<string> secondTokens = tokeniser.Tokenize(secondWord);
        // Guard: an empty first token list would otherwise divide by zero below and return NaN.
        if (firstTokens.Count == 0)
        {
            return defaultMismatchScore;
        }
        double sumMatches = 0.0;
        for (int i = 0; i < firstTokens.Count; i++)
        {
            string sToken = firstTokens[i];
            // Best match for this token across all tokens of the second word.
            double maxFound = 0.0;
            for (int j = 0; j < secondTokens.Count; j++)
            {
                string tToken = secondTokens[j];
                double found = internalStringMetric.GetSimilarity(sToken, tToken);
                if (found > maxFound)
                {
                    maxFound = found;
                }
            }
            sumMatches += maxFound;
        }
        return sumMatches / firstTokens.Count;
    }
    return defaultMismatchScore;
}
/// <summary>
/// gets the un-normalised similarity measure of the metric for the given strings.</summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns> returns the score of the similarity measure (un-normalised)</returns>
public override double GetUnnormalisedSimilarity(string firstWord, string secondWord)
{
    Collection<string> tokensOfFirst = tokeniser.Tokenize(firstWord);
    Collection<string> tokensOfSecond = tokeniser.Tokenize(secondWord);
    // The merged list is built for its side effect on tokenUtilities' internal state.
    tokenUtilities.CreateMergedList(tokensOfFirst, tokensOfSecond);
    return GetActualSimilarity(tokensOfFirst, tokensOfSecond);
}
/// <summary>
/// Gets the similarity of the two strings using the Overlap Coefficient:
/// overlap_coefficient(q,r) = ( | q and r | ) / min{ | q | , | r | }.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; <c>DefaultMismatchScore</c> when
/// either word is null or either word yields no tokens</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord != null && secondWord != null)
    {
        _tokenUtilities.CreateMergedSet(_tokeniser.Tokenize(firstWord), _tokeniser.Tokenize(secondWord));
        int smallerTokenCount = Math.Min(_tokenUtilities.FirstSetTokenCount, _tokenUtilities.SecondSetTokenCount);
        // Guard: a zero token count would otherwise divide by zero and return NaN.
        if (smallerTokenCount > 0)
        {
            return _tokenUtilities.CommonSetTerms() / (double)smallerTokenCount;
        }
    }
    return DefaultMismatchScore;
}
/// <summary>
/// Gets the actual euclidean distance (not the normalised 0-1 value).
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>the actual euclidean distance; 0.0 when either word is null</returns>
public double GetEuclidDistance(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return 0.0;
    }
    Collection<string> tokensOfFirst = _tokeniser.Tokenize(firstWord);
    Collection<string> tokensOfSecond = _tokeniser.Tokenize(secondWord);
    return GetActualDistance(tokensOfFirst, tokensOfSecond);
}
/// <summary>
/// Gets the estimated time in milliseconds it takes to perform a similarity timing.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>the estimated time in milliseconds; <c>DefaultMismatchScore</c> when either word is null</returns>
public override double GetSimilarityTimingEstimated(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return DefaultMismatchScore;
    }
    // Estimate scales with the product of the two token counts.
    double firstTokenCount = _tokeniser.Tokenize(firstWord).Count;
    double secondTokenCount = _tokeniser.Tokenize(secondWord).Count;
    return firstTokenCount * secondTokenCount * _estimatedTimingConstant;
}
/// <summary>
/// gets the estimated time in milliseconds it takes to perform a similarity timing.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>the estimated time in milliseconds taken to perform the similarity measure; 0.0 when either word is null</returns>
public override double GetSimilarityTimingEstimated(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return 0.0;
    }
    // Estimate scales with the product of the two token counts.
    double countOfFirst = tokeniser.Tokenize(firstWord).Count;
    double countOfSecond = tokeniser.Tokenize(secondWord).Count;
    return countOfFirst * countOfSecond * estimatedTimingConstant;
}
/// <summary>
/// gets the actual euclidean distance ie not the value between 0-1.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>the actual euclidean distance; <c>defaultMismatchScore</c> when either word is null</returns>
public double GetEuclidDistance(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return defaultMismatchScore;
    }
    Collection<string> tokensOfFirst = tokeniser.Tokenize(firstWord);
    Collection<string> tokensOfSecond = tokeniser.Tokenize(secondWord);
    return GetActualDistance(tokensOfFirst, tokensOfSecond);
}
/// <summary>
/// Gets the similarity of the two strings using Jaccard:
/// ( |X and Y| ) / ( |X or Y| ) over the token sets.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; <c>DefaultMismatchScore</c> when
/// either word is null or the merged token set is empty</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return DefaultMismatchScore;
    }
    Collection<string> mergedTokens = _tokenUtilities.CreateMergedSet(_tokeniser.Tokenize(firstWord), _tokeniser.Tokenize(secondWord));
    if (mergedTokens.Count == 0)
    {
        return DefaultMismatchScore;
    }
    return _tokenUtilities.CommonSetTerms() / (double)mergedTokens.Count;
}
/// <summary>
/// gets the similarity of the two strings using OverlapCoefficient
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; <c>defaultMismatchScore</c> when
/// either word is null or either word yields no tokens</returns>
/// <remarks>overlap_coefficient(q,r) = ( | q and r | ) / min{ | q | , | r | }.</remarks>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if ((firstWord != null) && (secondWord != null))
    {
        // BUG FIX: this call was commented out, so CommonSetTerms() and the token
        // counts reflected whatever pair was processed previously; the merged set
        // must be rebuilt for the current pair.
        tokenUtilities.CreateMergedSet(tokeniser.Tokenize(firstWord), tokeniser.Tokenize(secondWord));
        int smallerTokenCount = Math.Min(tokenUtilities.FirstSetTokenCount, tokenUtilities.SecondSetTokenCount);
        // Guard: a zero token count would otherwise divide by zero and return NaN.
        if (smallerTokenCount > 0)
        {
            return tokenUtilities.CommonSetTerms() / (double)smallerTokenCount;
        }
    }
    return defaultMismatchScore;
}
/// <summary>
/// gets the similarity of the two strings using DiceSimilarity
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; 0.0 when either word is null
/// or the merged token set is empty</returns>
/// <remarks>Dices coefficient = (2*Common Terms) / (Number of terms in String1 + Number of terms in String2).</remarks>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return 0.0;
    }
    Collection<string> mergedTokens = tokenUtilities.CreateMergedSet(tokeniser.Tokenize(firstWord), tokeniser.Tokenize(secondWord));
    if (mergedTokens.Count == 0)
    {
        return 0.0;
    }
    return (2.0 * tokenUtilities.CommonSetTerms()) / (tokenUtilities.FirstSetTokenCount + tokenUtilities.SecondSetTokenCount);
}
/// <summary>
/// gets the similarity of the two strings using JaccardSimilarity.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; <c>defaultMismatchScore</c> when
/// either word is null or the merged token set is empty</returns>
/// <remarks>Each instance is represented as a Jaccard vector similarity function. The Jaccard between two vectors X and Y is
/// (X*Y) / (|X||Y|-(X*Y))
/// where (X*Y) is the inner product of X and Y, and |X| = (X*X)^1/2, i.e. the Euclidean norm of X.
/// This can more easily be described as ( |X and Y| ) / ( | X or Y | )</remarks>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return defaultMismatchScore;
    }
    Collection<string> mergedTokens = tokenUtilities.CreateMergedSet(tokeniser.Tokenize(firstWord), tokeniser.Tokenize(secondWord));
    if (mergedTokens.Count == 0)
    {
        return defaultMismatchScore;
    }
    return (double)tokenUtilities.CommonSetTerms() / (double)mergedTokens.Count;
}
/// <summary>
/// gets the similarity of the two strings using BlockDistance.
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a 0-1 similarity score; 0.0 when both words tokenize to nothing</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    Collection<string> firstTokens = tokeniser.Tokenize(firstWord);
    Collection<string> secondTokens = tokeniser.Tokenize(secondWord);
    int totalPossible = firstTokens.Count + secondTokens.Count;
    // Guard: two empty token lists would otherwise divide by zero and return NaN.
    // 0.0 is consistent with the degenerate-case result of the other token metrics here.
    if (totalPossible == 0)
    {
        return 0.0;
    }
    double totalDistance = GetActualSimilarity(firstTokens, secondTokens);
    return (totalPossible - totalDistance) / totalPossible;
}
/// <summary>
/// Gets the similarity of the two strings using BlockDistance
/// (L1/city-block distance over token counts, normalised to 0-1).
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a 0-1 similarity score; 0.0 when both words tokenize to nothing</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    Collection<string> firstTokens = _tokeniser.Tokenize(firstWord);
    Collection<string> secondTokens = _tokeniser.Tokenize(secondWord);
    int totalPossible = firstTokens.Count + secondTokens.Count;
    // Guard: two empty token lists would otherwise divide by zero and return NaN.
    // 0.0 is consistent with the degenerate-case result of the other token metrics here.
    if (totalPossible == 0)
    {
        return 0.0;
    }
    double actualSimilarity = GetActualSimilarity(firstTokens, secondTokens);
    return (totalPossible - actualSimilarity) / totalPossible;
}
/// <summary>
/// Gets the similarity of the two strings using the cosine coefficient over
/// token sets: common terms / (sqrt(|first|) * sqrt(|second|)).
/// </summary>
/// <param name="firstWord">first word</param>
/// <param name="secondWord">second word</param>
/// <returns>a value between 0-1 of the similarity; 0.0 when either word is null
/// or the merged token set is empty</returns>
public override double GetSimilarity(string firstWord, string secondWord)
{
    if (firstWord == null || secondWord == null)
    {
        return 0.0;
    }
    if (_tokenUtilities.CreateMergedSet(_tokeniser.Tokenize(firstWord), _tokeniser.Tokenize(secondWord)).Count == 0)
    {
        return 0.0;
    }
    double denominator = Math.Sqrt(_tokenUtilities.FirstSetTokenCount) * Math.Sqrt(_tokenUtilities.SecondSetTokenCount);
    return _tokenUtilities.CommonSetTerms() / denominator;
}