Exemple #1
0
        /// <summary>
        /// Compute the Jaccard index: |A inter B| / |A union B|, where A and B are
        /// the sets of profile positions holding a non-zero count for each string.
        /// </summary>
        /// <param name="s1">First string</param>
        /// <param name="s2">Second string</param>
        /// <returns>
        /// The number of positions that are non-zero in both profiles divided by the
        /// number of positions that are non-zero in at least one of them. When no
        /// position is non-zero in either profile the ratio is undefined (0/0); in
        /// that case 1.0 is returned if the two strings are equal and 0.0 otherwise,
        /// rather than NaN.
        /// </returns>
        public double Similarity(string s1, string s2)
        {
            KShingling ks = new KShingling(k);

            int[] profile1 = ks.GetArrayProfile(s1);
            int[] profile2 = ks.GetArrayProfile(s2);

            // Bring both profiles to a common length (via WithPadding) so they can
            // be compared index by index.
            int length = Math.Max(profile1.Length, profile2.Length);

            profile1 = profile1.WithPadding(length);
            profile2 = profile2.WithPadding(length);

            int inter = 0;
            int union = 0;

            for (int i = 0; i < length; i++)
            {
                bool inFirst  = profile1[i] > 0;
                bool inSecond = profile2[i] > 0;

                if (inFirst || inSecond)
                {
                    union++;

                    if (inFirst && inSecond)
                    {
                        inter++;
                    }
                }
            }

            if (union == 0)
            {
                // Both profiles are empty or all zero: dividing would produce NaN.
                // Equal inputs are treated as fully similar, anything else as not
                // similar at all.
                return s1 == s2 ? 1.0 : 0.0;
            }

            return 1.0 * inter / union;
        }
        /// <summary>
        /// Computes the dot product of the two strings' profiles divided by the
        /// product of the profiles' norms.
        /// </summary>
        /// <param name="s1">First string</param>
        /// <param name="s2">Second string</param>
        /// <returns>
        /// 0 when either string has fewer than k characters; otherwise
        /// DotProduct(p1, p2) / (Norm(p1) * Norm(p2)) for the two profiles.
        /// </returns>
        public double Similarity(string s1, string s2)
        {
            bool tooShort = s1.Length < k || s2.Length < k;

            if (tooShort)
            {
                return 0;
            }

            var shingler = new KShingling(k);

            int[] left  = shingler.GetArrayProfile(s1);
            int[] right = shingler.GetArrayProfile(s2);

            // Evaluated in the same order as the single-expression form:
            // dot product first, then the two norms.
            var numerator   = DotProduct(left, right);
            var denominator = Norm(left) * Norm(right);

            return numerator / denominator;
        }
Exemple #3
0
        /// <summary>
        /// The distance between two strings is defined as the L1 norm of the
        /// difference of their profiles (the number of occurrences of each k-shingle).
        /// </summary>
        /// <param name="s1">The first string</param>
        /// <param name="s2">The second string</param>
        /// <returns>
        /// The sum, over every profile position, of the absolute difference between
        /// the two profiles' counts.
        /// </returns>
        public double Distance(string s1, string s2)
        {
            var shingler = new KShingling(k);

            int[] counts1 = shingler.GetArrayProfile(s1);
            int[] counts2 = shingler.GetArrayProfile(s2);

            // Both profiles are brought to the longer one's length before comparing.
            int size = Math.Max(counts1.Length, counts2.Length);

            int[] left  = counts1.WithPadding(size);
            int[] right = counts2.WithPadding(size);

            int total = 0;

            for (int idx = 0; idx < size; idx++)
            {
                int delta = left[idx] - right[idx];
                total += Math.Abs(delta);
            }

            return total;
        }
Exemple #4
0
        /// <summary>
        /// Similarity is computed as 2 * |A inter B| / (|A| + |B|), where A and B are
        /// the sets of profile positions holding a non-zero count for each string.
        /// </summary>
        /// <param name="s1">The first string</param>
        /// <param name="s2">The second string</param>
        /// <returns>
        /// Twice the number of positions that are non-zero in both profiles, divided
        /// by the total number of non-zero positions of the two profiles. When
        /// neither profile has a non-zero position the ratio is undefined (0/0); in
        /// that case 1.0 is returned if the two strings are equal and 0.0 otherwise,
        /// rather than NaN.
        /// </returns>
        public double Similarity(string s1, string s2)
        {
            KShingling ks = new KShingling(k);

            int[] profile1 = ks.GetArrayProfile(s1);
            int[] profile2 = ks.GetArrayProfile(s2);

            // Bring both profiles to a common length (via WithPadding) so they can
            // be compared index by index.
            int length = Math.Max(profile1.Length, profile2.Length);

            profile1 = profile1.WithPadding(length);
            profile2 = profile2.WithPadding(length);

            int inter = 0;
            int sum   = 0;

            for (int i = 0; i < length; i++)
            {
                bool inFirst  = profile1[i] > 0;
                bool inSecond = profile2[i] > 0;

                if (inFirst && inSecond)
                {
                    inter++;
                }

                if (inFirst)
                {
                    sum++;
                }

                if (inSecond)
                {
                    sum++;
                }
            }

            if (sum == 0)
            {
                // Both profiles are empty or all zero: dividing would produce NaN.
                // Equal inputs are treated as fully similar, anything else as not
                // similar at all.
                return s1 == s2 ? 1.0 : 0.0;
            }

            return 2.0 * inter / sum;
        }
 /// <summary>
 /// Creates a string set from a boolean vector and the shingling instance
 /// passed alongside it; both are stored as-is.
 /// </summary>
 /// <param name="vector">The sparse boolean vector to store</param>
 /// <param name="ks">The KShingling instance to store</param>
 public StringSet(SparseBooleanVector vector, KShingling ks)
 {
     this._ks     = ks;
     this._vector = vector;
 }
 /// <summary>
 /// Creates a string profile from an integer vector and the shingling instance
 /// passed alongside it; both are stored as-is.
 /// </summary>
 /// <param name="vector">The sparse integer vector to store</param>
 /// <param name="ks">The KShingling instance to store</param>
 public StringProfile(SparseIntegerVector vector, KShingling ks)
 {
     this._ks    = ks;
     this.Vector = vector;
 }