/// <summary>
/// Computes the Jaccard index of two strings: |A inter B| / |A union B|,
/// where A and B are the sets of profile positions that hold a positive count
/// in the k-shingle array profile of each string.
/// </summary>
/// <param name="s1">First string</param>
/// <param name="s2">Second string</param>
/// <returns>
/// The number of positions that are positive in both profiles divided by the number
/// of positions that are positive in at least one of them. When no position is
/// positive in either profile the ratio is undefined (0 / 0); in that case 1 is
/// returned if the two strings are equal (ordinal comparison) and 0 otherwise,
/// so the method never returns NaN.
/// </returns>
public double Similarity(string s1, string s2)
{
    KShingling ks = new KShingling(k);
    int[] profile1 = ks.GetArrayProfile(s1);
    int[] profile2 = ks.GetArrayProfile(s2);

    // Bring both profiles to a common length so they can be compared index by index.
    // NOTE(review): WithPadding presumably fills the added slots with zeros - confirm.
    int length = Math.Max(profile1.Length, profile2.Length);
    profile1 = profile1.WithPadding(length);
    profile2 = profile2.WithPadding(length);

    int inter = 0;
    int union = 0;
    for (int i = 0; i < length; i++)
    {
        bool inFirst = profile1[i] > 0;
        bool inSecond = profile2[i] > 0;

        if (inFirst || inSecond)
        {
            union++;

            if (inFirst && inSecond)
            {
                inter++;
            }
        }
    }

    if (union == 0)
    {
        // Both sets are empty: dividing would give 0.0 / 0 == NaN. Identical inputs
        // are treated as fully similar, anything else as sharing nothing.
        return string.Equals(s1, s2, StringComparison.Ordinal) ? 1.0 : 0.0;
    }

    return 1.0 * inter / union;
}
/// <summary>
/// Computes the similarity of two strings as
/// DotProduct(p1, p2) / (Norm(p1) * Norm(p2)), where p1 and p2 are the
/// k-shingle array profiles of the strings. This is the cosine formula, assuming
/// DotProduct and Norm are the usual dot product and Euclidean norm
/// (NOTE(review): confirm against their definitions).
/// </summary>
/// <param name="s1">The first string</param>
/// <param name="s2">The second string</param>
/// <returns>
/// 0 when either string has fewer than k characters; otherwise the ratio described above.
/// </returns>
public double Similarity(string s1, string s2)
{
    // A string shorter than k is rejected up front, before any profile is built.
    bool bothLongEnough = s1.Length >= k && s2.Length >= k;
    if (!bothLongEnough)
    {
        return 0;
    }

    var shingler = new KShingling(k);
    int[] first = shingler.GetArrayProfile(s1);
    int[] second = shingler.GetArrayProfile(s2);

    var dot = DotProduct(first, second);
    var magnitude = Norm(first) * Norm(second);
    return dot / magnitude;
}
/// <summary>
/// Computes the distance between two strings as the L1 norm of the difference
/// of their profiles (the number of occurrences of each k-shingle).
/// </summary>
/// <param name="s1">The first string</param>
/// <param name="s2">The second string</param>
/// <returns>The sum of the absolute element-wise differences between the two profiles.</returns>
public double Distance(string s1, string s2)
{
    KShingling shingler = new KShingling(k);
    int[] left = shingler.GetArrayProfile(s1);
    int[] right = shingler.GetArrayProfile(s2);

    // Pad both profiles to a common length so every index exists in both arrays.
    int size = Math.Max(left.Length, right.Length);
    left = left.WithPadding(size);
    right = right.WithPadding(size);

    int total = 0;
    int index = 0;
    while (index < size)
    {
        int delta = left[index] - right[index];
        total += Math.Abs(delta);
        index++;
    }

    return total;
}
/// <summary>
/// Computes the Sorensen-Dice coefficient of two strings: 2 * |A inter B| / (|A| + |B|),
/// where A and B are the sets of profile positions that hold a positive count
/// in the k-shingle array profile of each string.
/// </summary>
/// <param name="s1">The first string</param>
/// <param name="s2">The second string</param>
/// <returns>
/// Twice the number of positions that are positive in both profiles, divided by the
/// total number of positive positions counted over both profiles. When no position
/// is positive in either profile the ratio is undefined (0 / 0); in that case 1 is
/// returned if the two strings are equal (ordinal comparison) and 0 otherwise,
/// so the method never returns NaN.
/// </returns>
public double Similarity(string s1, string s2)
{
    KShingling ks = new KShingling(k);
    int[] profile1 = ks.GetArrayProfile(s1);
    int[] profile2 = ks.GetArrayProfile(s2);

    // Bring both profiles to a common length so they can be compared index by index.
    // NOTE(review): WithPadding presumably fills the added slots with zeros - confirm.
    int length = Math.Max(profile1.Length, profile2.Length);
    profile1 = profile1.WithPadding(length);
    profile2 = profile2.WithPadding(length);

    int inter = 0;
    int sum = 0;
    for (int i = 0; i < length; i++)
    {
        bool inFirst = profile1[i] > 0;
        bool inSecond = profile2[i] > 0;

        if (inFirst && inSecond)
        {
            inter++;
        }

        if (inFirst)
        {
            sum++;
        }

        if (inSecond)
        {
            sum++;
        }
    }

    if (sum == 0)
    {
        // Both sets are empty: dividing would give 0.0 / 0 == NaN. Identical inputs
        // are treated as fully similar, anything else as sharing nothing.
        return string.Equals(s1, s2, StringComparison.Ordinal) ? 1.0 : 0.0;
    }

    return 2.0 * inter / sum;
}