/// <summary>
        /// Splits the characters of the specified string into n-gram chunks and returns every chunk as a string
        /// </summary>
        /// <param name="item">The string whose characters are sliced; it is converted to a list of characters first</param>
        /// <param name="N">Size of N-grams, passed unchanged to <c>setAnalysisTools&lt;Char&gt;.getNGramSet</c></param>
        /// <param name="mode">The slicing mode, passed unchanged to <c>setAnalysisTools&lt;Char&gt;.getNGramSet</c></param>
        /// <returns>One string per character chunk, in the order the chunks were returned</returns>
        public static List<String> getStringNGramSet(String item, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
        {
            List<Char> characters = item.ToCharArray().ToList();

            // The actual slicing is delegated; this method only converts to and from character lists
            var chunks = setAnalysisTools<Char>.getNGramSet<List<Char>>(characters, N, mode);

            return chunks.Select((List<char> chunk) => String.Concat(chunk)).ToList();
        }
        /// <summary>
        /// Builds a single descriptive line showing how the specified word is deconstructed into n-grams
        /// </summary>
        /// <param name="word">The word to be split into n-grams</param>
        /// <param name="N">Size of N-grams, e.g. for bigrams: N=2</param>
        /// <param name="mode">The slicing mode</param>
        /// <returns>Line used for debugging: a header holding the word, the mode and N, followed by the n-grams</returns>
        public static String getNGramsDescriptiveLine(String word, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
        {
            List<String> parts = getNGrams(word, N, mode);

            String description = String.Format("[{0}] ({1}, N={2}) => ", word, mode.ToString(), N);

            // add() is a string extension declared elsewhere in the project;
            // presumably it appends the n-gram using ", " as separator - TODO confirm
            for (Int32 index = 0; index < parts.Count; index++)
            {
                description = description.add(parts[index], ", ");
            }

            return description;
        }
Beispiel #3
0
        /// <summary>
        /// Breaks the specified word into <c>N</c>-grams
        /// </summary>
        /// <remarks>
        /// <para>A word that is not longer than <c>N</c> is returned as the only item of the result.</para>
        /// <para>In <c>overlap</c> mode the window advances one character at a time; for <c>N</c> greater than 1 a trailing window
        /// shorter than <c>N</c> is still emitted as long as it holds more than one character. For <c>N</c>=1 every character is emitted.</para>
        /// <para>In <c>ordinal</c> mode the window advances <c>N</c> characters at a time and a shorter trailing chunk is emitted as well.</para>
        /// <para>Any other mode value advances one character at a time and emits every window, including the shorter trailing ones.</para>
        /// <para>If <c>N</c> is smaller than 1 and the word is not empty, an empty list is returned.</para>
        /// </remarks>
        /// <param name="word">The word to be split into n-grams</param>
        /// <param name="N">Size of N-grams, e.g. for bigrams: N=2</param>
        /// <param name="mode">The slicing mode</param>
        /// <returns>Set of NGrams</returns>
        public static List<String> getNGrams(String word, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
        {
            List<String> output = new List<string>();

            // Short input: the whole word is the only n-gram
            if (word.Length <= N)
            {
                output.Add(word);
                return output;
            }

            // A non-positive N cannot produce any n-gram. Returning here also prevents
            // the ordinal mode from looping with a zero or negative step.
            if (N < 1)
            {
                return output;
            }

            Int32 step = 1;
            Int32 remnant = 0;

            switch (mode)
            {
            case nGramsModeEnum.overlap:
                step = 1;
                // Trailing single-character windows are dropped, except for unigrams,
                // where a single character is a complete n-gram
                remnant = (N > 1) ? 1 : 0;
                break;

            case nGramsModeEnum.ordinal:
                step = N;
                remnant = 0;
                break;
            }

            for (Int32 start = 0; start < word.Length; start += step)
            {
                // The window is clipped at the end of the word
                Int32 length = Math.Min(N, word.Length - start);
                if (length > remnant)
                {
                    output.Add(word.Substring(start, length));
                }
            }

            return output;
        }
        /// <summary>
        /// Computes similarity of two words from their n-gram decompositions
        /// </summary>
        /// <param name="wordA">The first word; split with <c>getNGrams</c></param>
        /// <param name="wordB">The second word; split with <c>getNGrams</c> using the same size and mode</param>
        /// <param name="equationEnum">Selects the similarity equation; any value not handled explicitly falls back to the Jaccard index</param>
        /// <param name="nGramSize">Size of the n-grams both words are split into</param>
        /// <param name="nGramMode">The slicing mode used for both words</param>
        /// <returns>The value returned by the selected similarity equation</returns>
        public static Double GetSimilarity(String wordA, String wordB, nGramsSimilarityEquationEnum equationEnum = nGramsSimilarityEquationEnum.JaccardIndex, Int32 nGramSize = 2, nGramsModeEnum nGramMode = nGramsModeEnum.overlap)
        {
            var gramsOfA = getNGrams(wordA, nGramSize, nGramMode);
            var gramsOfB = getNGrams(wordB, nGramSize, nGramMode);

            if (equationEnum == nGramsSimilarityEquationEnum.DiceCoefficient)
            {
                return GetDiceCoefficient(gramsOfA, gramsOfB);
            }

            if (equationEnum == nGramsSimilarityEquationEnum.continualOverlapRatio)
            {
                return GetContinualOverlapRatio(gramsOfA, gramsOfB);
            }

            // JaccardIndex and every other value
            return GetJaccardIndex(gramsOfA, gramsOfB);
        }
        /// <summary>
        /// Breaks the specified sequence into <c>N</c>-gram sub sequences
        /// </summary>
        /// <remarks>
        /// <para>A sequence that is not longer than <c>N</c> is returned as the only item of the result; in that case
        /// the very same list instance is added, not a copy.</para>
        /// <para>In <c>overlap</c> mode the window advances one item at a time; for <c>N</c> greater than 1 a trailing window
        /// shorter than <c>N</c> is still emitted as long as it holds more than one item. For <c>N</c>=1 every item is emitted.</para>
        /// <para>In <c>ordinal</c> mode the window advances <c>N</c> items at a time and a shorter trailing chunk is emitted as well.</para>
        /// <para>If <c>N</c> is smaller than 1 and the sequence is not empty, an empty list is returned.</para>
        /// </remarks>
        /// <param name="sets">The sets.</param>
        /// <param name="N">Size of N-grams, e.g. for bigrams: N=2</param>
        /// <param name="mode">The slicing mode</param>
        /// <returns>
        /// Set of NGrams
        /// </returns>
        /// <exception cref="Exception">Unexpected Case - the mode is neither overlap nor ordinal</exception>
        public List<List<List<T>>> getNGrams(List<List<T>> sets, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
        {
            List<List<List<T>>> output = new List<List<List<T>>>();

            Int32 step;
            Int32 remnant;

            switch (mode)
            {
            case nGramsModeEnum.overlap:
                step = 1;
                // Trailing single-item windows are dropped, except for unigrams,
                // where a single item is a complete n-gram
                remnant = (N > 1) ? 1 : 0;
                break;

            case nGramsModeEnum.ordinal:
                step = N;
                remnant = 0;
                break;

            default:
                throw new Exception("Unexpected Case");
            }

            // Short input: the whole sequence is the only n-gram
            if (sets.Count <= N)
            {
                output.Add(sets);
                return output;
            }

            // A non-positive N cannot produce any n-gram. Returning here also prevents
            // the ordinal mode from looping with a zero or negative step.
            if (N < 1)
            {
                return output;
            }

            for (Int32 start = 0; start < sets.Count; start += step)
            {
                // The window is clipped at the end of the sequence
                Int32 length = Math.Min(N, sets.Count - start);
                if (length > remnant)
                {
                    output.Add(sets.GetRange(start, length));
                }
            }

            return output;
        }
        /// <summary>
        /// Breaks the specified sequence of items into n-gram chunks
        /// </summary>
        /// <remarks>
        /// <para>A sequence that is not longer than <c>N</c> is returned as a single chunk holding all items.</para>
        /// <para>In <c>overlap</c> mode the window advances one item at a time; for <c>N</c> greater than 1 a trailing window
        /// shorter than <c>N</c> is still emitted as long as it holds more than one item. For <c>N</c>=1 every item is emitted.</para>
        /// <para>In <c>ordinal</c> mode the window advances <c>N</c> items at a time and a shorter trailing chunk is emitted as well.</para>
        /// <para>If <c>N</c> is smaller than 1 and the sequence is not empty, an empty list is returned.</para>
        /// </remarks>
        /// <typeparam name="TNGram">List type used for every chunk; a new instance is created per chunk and filled with the chunk items</typeparam>
        /// <param name="items">The items.</param>
        /// <param name="N">Size of N-grams, e.g. for bigrams: N=2</param>
        /// <param name="mode">The slicing mode</param>
        /// <returns>The chunks, in the order of their start position</returns>
        /// <exception cref="Exception">Unexpected Case - the mode is neither overlap nor ordinal</exception>
        public static List<TNGram> getNGramSet<TNGram>(List<T> items, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap) where TNGram : List<T>, new()
        {
            List<TNGram> output = new List<TNGram>();

            Int32 step;
            Int32 remnant;

            switch (mode)
            {
            case nGramsModeEnum.overlap:
                step = 1;
                // Trailing single-item windows are dropped, except for unigrams,
                // where a single item is a complete n-gram
                remnant = (N > 1) ? 1 : 0;
                break;

            case nGramsModeEnum.ordinal:
                step = N;
                remnant = 0;
                break;

            default:
                throw new Exception("Unexpected Case");
            }

            // Short input: all items form the only chunk
            if (items.Count <= N)
            {
                TNGram whole = new TNGram();
                whole.AddRange(items);
                output.Add(whole);
                return output;
            }

            // A non-positive N cannot produce any n-gram. Returning here also prevents
            // the ordinal mode from looping with a zero or negative step.
            if (N < 1)
            {
                return output;
            }

            for (Int32 start = 0; start < items.Count; start += step)
            {
                // The window is clipped at the end of the sequence
                Int32 length = Math.Min(N, items.Count - start);
                if (length > remnant)
                {
                    TNGram chunk = new TNGram();
                    chunk.AddRange(items.GetRange(start, length));
                    output.Add(chunk);
                }
            }

            return output;
        }