/// <summary>
/// Slices the characters of <paramref name="item"/> into n-grams and returns every n-gram as a string.
/// </summary>
/// <param name="item">The text whose characters are sliced into n-grams.</param>
/// <param name="N">Size of the n-grams, e.g. for bigrams: N=2.</param>
/// <param name="mode">The slicing mode, passed through unchanged to <c>setAnalysisTools&lt;Char&gt;.getNGramSet</c>.</param>
/// <returns>One string per character n-gram, in the order the character n-grams were produced.</returns>
public static List<String> getStringNGramSet(String item, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
{
    // Work on a list of characters, because the generic n-gram builder operates on List<T>.
    List<Char> characters = new List<Char>(item.ToCharArray());
    var slices = setAnalysisTools<Char>.getNGramSet<List<Char>>(characters, N, mode);

    // Glue the characters of every slice back together into a string.
    List<String> result = new List<String>();
    foreach (List<Char> slice in slices)
    {
        result.Add(String.Concat(slice));
    }

    return result;
}
/// <summary>
/// Builds a single descriptive line showing how the specified word is deconstructed into n-grams.
/// </summary>
/// <param name="word">The word to be split into n-grams.</param>
/// <param name="N">Size of the n-grams, e.g. for bigrams: N=2.</param>
/// <param name="mode">The slicing mode.</param>
/// <returns>
/// A line meant for debugging: a header containing the word, the mode and <c>N</c>,
/// followed by the n-grams appended one by one through the <c>add</c> string extension.
/// </returns>
public static String getNGramsDescriptiveLine(String word, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
{
    List<String> pieces = getNGrams(word, N, mode);

    // Header: "[word] (mode, N=n) => "
    String description = String.Format("[{0}] ({1}, N={2}) => ", word, mode.ToString(), N);

    // Each n-gram is appended with ", " as the second argument of the add extension;
    // the exact joining rules are defined by that extension, not here.
    for (int index = 0; index < pieces.Count; index++)
    {
        description = description.add(pieces[index], ", ");
    }

    return description;
}
/// <summary>
/// Breaks the specified word into <c>N</c>-grams.
/// </summary>
/// <remarks>
/// <para>
/// In <c>overlap</c> mode the window advances one character at a time and a trailing slice is kept
/// only when it is longer than one character (e.g. "abcde", N=3 gives "abc", "bcd", "cde", "de").
/// In <c>ordinal</c> mode the window advances <c>N</c> characters at a time and every non-empty
/// trailing slice is kept (e.g. "abcde", N=2 gives "ab", "cd", "e").
/// Any other mode value falls back to a step of one character with every non-empty slice kept.
/// </para>
/// <para>
/// A word whose length is at most <c>N</c> is returned as the single element of the result.
/// When <c>N</c> is smaller than 1 and the word is longer than <c>N</c>, the result is empty.
/// </para>
/// </remarks>
/// <param name="word">The word to be split into n-grams.</param>
/// <param name="N">Size of N-grams, e.g. for bigrams: N=2.</param>
/// <param name="mode">The slicing mode.</param>
/// <returns>Set of n-grams, in the order they occur in the word.</returns>
public static List<String> getNGrams(String word, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
{
    List<String> output = new List<String>();

    // step    = how far the window moves between slices
    // remnant = a slice is kept only when it is longer than this
    Int32 step = 1;
    Int32 remnant = 0;

    switch (mode)
    {
        case nGramsModeEnum.overlap:
            step = 1;
            remnant = 1;
            break;

        case nGramsModeEnum.ordinal:
            step = N;
            remnant = 0;
            break;
    }

    // The word is too short to be sliced: it is its own (only) n-gram.
    if (word.Length <= N)
    {
        output.Add(word);
        return output;
    }

    // With N < 1 no slice can have a positive length, so nothing would ever be added.
    // Returning here also prevents the ordinal-mode loop from running with a zero or
    // negative step, which would never terminate normally.
    if (N < 1)
    {
        return output;
    }

    // NOTE(review): in overlap mode with N=1 every slice has length 1 and is therefore
    // dropped by the remnant rule, so the result is empty - confirm this is intended.
    for (int i = 0; i < word.Length; i = i + step)
    {
        Int32 len = Math.Min(N, word.Length - i);
        if (len > remnant)
        {
            output.Add(word.Substring(i, len));
        }
    }

    return output;
}
/// <summary>
/// Computes the similarity of two words from their n-gram decompositions.
/// </summary>
/// <param name="wordA">The first word.</param>
/// <param name="wordB">The second word.</param>
/// <param name="equationEnum">
/// The similarity equation to apply. Any value other than <c>DiceCoefficient</c> or
/// <c>continualOverlapRatio</c> is evaluated as <c>JaccardIndex</c>.
/// </param>
/// <param name="nGramSize">Size of the n-grams both words are broken into.</param>
/// <param name="nGramMode">The n-gram slicing mode used for both words.</param>
/// <returns>The value returned by the selected similarity equation for the two n-gram sets.</returns>
public static Double GetSimilarity(String wordA, String wordB, nGramsSimilarityEquationEnum equationEnum = nGramsSimilarityEquationEnum.JaccardIndex, Int32 nGramSize = 2, nGramsModeEnum nGramMode = nGramsModeEnum.overlap)
{
    // Both words are sliced with the same size and mode so their n-grams are comparable.
    var ngram_A = getNGrams(wordA, nGramSize, nGramMode);
    var ngram_B = getNGrams(wordB, nGramSize, nGramMode);

    // Every branch returns, so no break statements or trailing return are needed.
    switch (equationEnum)
    {
        case nGramsSimilarityEquationEnum.DiceCoefficient:
            return GetDiceCoefficient(ngram_A, ngram_B);

        case nGramsSimilarityEquationEnum.continualOverlapRatio:
            return GetContinualOverlapRatio(ngram_A, ngram_B);

        case nGramsSimilarityEquationEnum.JaccardIndex:
        default:
            return GetJaccardIndex(ngram_A, ngram_B);
    }
}
/// <summary>
/// Breaks the specified sequence into <c>N</c>-gram sub sequences.
/// </summary>
/// <remarks>
/// In <c>overlap</c> mode the window advances one element at a time and a trailing slice is kept
/// only when it holds more than one element. In <c>ordinal</c> mode the window advances <c>N</c>
/// elements at a time and every non-empty trailing slice is kept.
/// When the sequence has at most <c>N</c> elements, the input list itself (not a copy) becomes the
/// single element of the result. When <c>N</c> is smaller than 1 and the sequence is longer than
/// <c>N</c>, the result is empty.
/// </remarks>
/// <param name="sets">The sequence to slice.</param>
/// <param name="N">Size of N-grams, e.g. for bigrams: N=2.</param>
/// <param name="mode">The slicing mode.</param>
/// <returns>
/// Set of n-grams, each one a sub-range of <paramref name="sets"/>, in sequence order.
/// </returns>
/// <exception cref="Exception">Thrown when <paramref name="mode"/> is neither <c>overlap</c> nor <c>ordinal</c>.</exception>
public List<List<List<T>>> getNGrams(List<List<T>> sets, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap)
{
    List<List<List<T>>> output = new List<List<List<T>>>();

    // step    = how far the window moves between slices
    // remnant = a slice is kept only when it holds more elements than this
    Int32 step = 1;
    Int32 remnant = 0;

    switch (mode)
    {
        case nGramsModeEnum.overlap:
            step = 1;
            remnant = 1;
            break;

        case nGramsModeEnum.ordinal:
            step = N;
            remnant = 0;
            break;

        default:
            throw new Exception("Unexpected Case");
    }

    // The sequence is too short to be sliced: it is its own (only) n-gram.
    if (sets.Count <= N)
    {
        output.Add(sets);
        return output;
    }

    // With N < 1 no slice can hold any element, so nothing would ever be added.
    // Returning here also prevents the ordinal-mode loop from running with a zero or
    // negative step, which would never terminate normally.
    if (N < 1)
    {
        return output;
    }

    for (int i = 0; i < sets.Count; i = i + step)
    {
        Int32 len = Math.Min(N, sets.Count - i);
        if (len > remnant)
        {
            output.Add(sets.GetRange(i, len));
        }
    }

    return output;
}
/// <summary>
/// Breaks the specified sequence of items into n-gram chunks.
/// </summary>
/// <remarks>
/// In <c>overlap</c> mode the window advances one item at a time and a trailing chunk is kept
/// only when it holds more than one item. In <c>ordinal</c> mode the window advances <c>N</c>
/// items at a time and every non-empty trailing chunk is kept.
/// When the sequence has at most <c>N</c> items, the result is a single chunk holding all of them.
/// When <c>N</c> is smaller than 1 and the sequence is longer than <c>N</c>, the result is empty.
/// </remarks>
/// <typeparam name="TNGram">The list type used to hold one n-gram; a new instance is created per chunk.</typeparam>
/// <param name="items">The items to slice.</param>
/// <param name="N">Size of the n-grams, e.g. for bigrams: N=2.</param>
/// <param name="mode">The slicing mode.</param>
/// <returns>The n-gram chunks, in sequence order; each chunk is a newly created <typeparamref name="TNGram"/>.</returns>
/// <exception cref="Exception">Thrown when <paramref name="mode"/> is neither <c>overlap</c> nor <c>ordinal</c>.</exception>
public static List<TNGram> getNGramSet<TNGram>(List<T> items, Int32 N = 2, nGramsModeEnum mode = nGramsModeEnum.overlap) where TNGram : List<T>, new()
{
    List<TNGram> output = new List<TNGram>();

    // step    = how far the window moves between chunks
    // remnant = a chunk is kept only when it holds more items than this
    Int32 step = 1;
    Int32 remnant = 0;

    switch (mode)
    {
        case nGramsModeEnum.overlap:
            step = 1;
            remnant = 1;
            break;

        case nGramsModeEnum.ordinal:
            step = N;
            remnant = 0;
            break;

        default:
            throw new Exception("Unexpected Case");
    }

    // The sequence is too short to be sliced: all items form the single chunk.
    if (items.Count <= N)
    {
        TNGram whole = new TNGram();
        whole.AddRange(items);
        output.Add(whole);
        return output;
    }

    // With N < 1 no chunk can hold any item, so nothing would ever be added.
    // Returning here also prevents the ordinal-mode loop from running with a zero or
    // negative step, which would never terminate normally.
    if (N < 1)
    {
        return output;
    }

    for (int i = 0; i < items.Count; i = i + step)
    {
        Int32 len = Math.Min(N, items.Count - i);
        if (len > remnant)
        {
            TNGram chunk = new TNGram();
            chunk.AddRange(items.GetRange(i, len));
            output.Add(chunk);
        }
    }

    return output;
}