/// <summary>
/// Builds a frequency table of the n-grams that the tokenizer produces for
/// <paramref name="input"/>.
/// </summary>
/// <param name="input">Text that is handed to <c>GramTokenizer.tokenize</c>.</param>
/// <param name="length">Gram length handed to <c>GramTokenizer.tokenize</c>.</param>
/// <returns>
/// A dictionary keyed by n-gram. Every value is a two-element array; the slot
/// <c>Enigma.ABSOLUTE</c> is incremented per repeated occurrence and the slot
/// <c>Enigma.PERCENTAGED</c> receives the <c>Enigma.ABSOLUTE</c> value divided by the
/// sum of all <c>Enigma.ABSOLUTE</c> values. The dictionary is empty when the tokenizer
/// yields nothing.
/// </returns>
private IDictionary<string, double[]> calculateConditional(string input, int length)
{
    IDictionary<string, double[]> inputGrams = new Dictionary<string, double[]>();

    // NOTE(review): the third tokenizer argument is presumably the "boundary fragments"
    // switch (another caller passes a BoundaryFragments setting there) - confirm.
    foreach (string g in GramTokenizer.tokenize(input, length, false))
    {
        // One lookup instead of ContainsKey followed by the indexer.
        double[] counts;
        if (inputGrams.TryGetValue(g, out counts))
        {
            counts[Enigma.ABSOLUTE]++;
        }
        else
        {
            // First occurrence. NOTE(review): this literal assumes Enigma.ABSOLUTE == 0
            // and Enigma.PERCENTAGED == 1 - confirm against the constant definitions.
            inputGrams[g] = new double[] { 1, 0 };
        }
    }

    // With an empty table the loop below does not run, so no division by zero occurs.
    double sum = inputGrams.Values.Sum(item => item[Enigma.ABSOLUTE]);
    foreach (double[] values in inputGrams.Values)
    {
        values[Enigma.PERCENTAGED] = values[Enigma.ABSOLUTE] / sum;
    }

    return inputGrams;
}
/// <summary>
/// Splits a single word into n-grams and records every occurrence in <c>grams</c>.
/// </summary>
/// <remarks>
/// Depending on <c>settings</c>, the word is first passed through
/// <c>StringUtil.StripUnknownSymbols</c> (when <c>ProcessUnknownSymbols</c> is 0) and
/// upper-cased (when <c>CaseSensitivity</c> is 0). A word that is empty after stripping
/// is ignored.
/// </remarks>
/// <param name="workstring">The word to tokenize and count.</param>
private void ProcessWord(string workstring)
{
    string word = workstring;

    if (settings.ProcessUnknownSymbols == 0)
    {
        word = StringUtil.StripUnknownSymbols(validChars, word);
    }

    // Nothing left to tokenize.
    if (word.Length == 0)
    {
        return;
    }

    if (settings.CaseSensitivity == 0)
    {
        word = word.ToUpper();
    }

    int gramLength = settings.GrammLength;
    bool withBoundaryFragments = settings.BoundaryFragments == 1;

    foreach (string gram in GramTokenizer.tokenize(word, gramLength, withBoundaryFragments))
    {
        if (grams.ContainsKey(gram))
        {
            // Known n-gram: bump the value stored in the ABSOLUTE slot.
            grams[gram][ABSOLUTE]++;
            continue;
        }

        // First sighting: four-slot record whose first slot starts at 1.
        grams[gram] = new double[] { 1, 0, 0, 0 };
    }
}
}//end calculateEntropy

/// <summary>
/// Scores a text against the corpus n-gram statistics: for every n-gram of the input that
/// is present in the corpus, the corpus value at index <paramref name="valueSelection"/>
/// is added to the score. The input is upper-cased before it is tokenized.
/// </summary>
/// <param name="input">The text to be scored</param>
/// <param name="length">n-gram length</param>
/// <param name="valueSelection">Index into a corpus entry's value array that selects the number to add</param>
/// <param name="weighted">
/// When true, each distinct n-gram of the input is considered only once and
/// <c>weights</c> is invoked for every considered n-gram found in the corpus.
/// When false, every occurrence is considered.
/// </param>
/// <returns>The accumulated score</returns>
public double calculateNGrams(string input, int length, int valueSelection, bool weighted)
{
    double score = 0;

    // Fetch the statistics once. Previously the first call for length 2 or 3 fetched them
    // twice: once to fill the cache field and once more for the local used below.
    var corpus = GetStatistics(length);

    if (corpusBigrams == null && length == 2)
    {
        corpusBigrams = corpus;
    }

    if (corpusTrigrams == null && length == 3)
    {
        corpusTrigrams = corpus;
    }

    input = input.ToUpper(); // FIXME: case handling?

    HashSet<string> inputGrams = new HashSet<string>();

    foreach (string g in GramTokenizer.tokenize(input, length, false))
    {
        // In weighted mode ensure each n-gram is counted only once; Add returns false for
        // duplicates. In unweighted mode the set is not consulted at all.
        if (!weighted || inputGrams.Add(g))
        {
            if (corpus.ContainsKey(g))
            {
                score += corpus[g][valueSelection];

                if (weighted)
                {
                    // NOTE(review): any return value of weights is discarded here;
                    // presumably it is called for a side effect - confirm.
                    weights(g, length);
                }
            }
        }
    }

    return score;
}
/// <summary>
/// Scores a text against the corpus n-gram statistics supplied by <c>pluginFacade</c>:
/// for every distinct n-gram of the input that is present in the corpus, the corpus value
/// at index <paramref name="valueSelection"/> is added to the score.
/// No case normalization is performed here; the input is tokenized as given.
/// </summary>
/// <param name="input">The text to be scored</param>
/// <param name="length">n-gram length</param>
/// <param name="valueSelection">Index into a corpus entry's value array that selects the number to add</param>
/// <returns>The accumulated score</returns>
private double calculateNGrams(string input, int length, int valueSelection)
{
    double score = 0;

    IDictionary<string, double[]> corpusGrams = pluginFacade.GetStatistics(length);

    // FIXME: case handling?

    HashSet<string> inputGrams = new HashSet<string>();

    foreach (string g in GramTokenizer.tokenize(input, length, false))
    {
        // Ensure each n-gram is counted only once; Add returns false for duplicates.
        if (!inputGrams.Add(g))
        {
            continue;
        }

        // One lookup instead of ContainsKey followed by the indexer.
        double[] corpusValues;
        if (corpusGrams.TryGetValue(g, out corpusValues))
        {
            score += corpusValues[valueSelection];
        }
    }

    return score;
}