Пример #1
0
        /// <summary>
        /// Counts how often each n-gram produced by the tokenizer occurs in the input and
        /// stores, for every n-gram, its count divided by the sum of all counts.
        /// </summary>
        /// <param name="input">The text to be split into n-grams</param>
        /// <param name="length">n-gram length handed to the tokenizer</param>
        /// <returns>
        /// A dictionary mapping each n-gram to a two-element array: the occurrence count at
        /// index <c>Enigma.ABSOLUTE</c> and the count/total ratio at index <c>Enigma.PERCENTAGED</c>.
        /// </returns>
        private IDictionary <string, double[]> calculateConditional(string input, int length)
        {
            IDictionary <string, double[]> inputGrams = new Dictionary <string, double[]>();

            foreach (string g in GramTokenizer.tokenize(input, length, false))
            {
                // Single lookup instead of ContainsKey followed by the indexer.
                double[] counts;
                if (inputGrams.TryGetValue(g, out counts))
                {
                    counts[Enigma.ABSOLUTE]++;
                }
                else
                {
                    // NOTE(review): the initial count 1 is written to slot 0, which assumes
                    // Enigma.ABSOLUTE == 0 and Enigma.PERCENTAGED == 1 -- confirm.
                    inputGrams[g] = new double[] { 1, 0 };
                }
            }

            double sum = inputGrams.Values.Sum(item => item[Enigma.ABSOLUTE]);

            // When no n-grams were produced the dictionary is empty, so the division below never runs.
            foreach (double[] values in inputGrams.Values)
            {
                values[Enigma.PERCENTAGED] = values[Enigma.ABSOLUTE] / sum;
            }

            return inputGrams;
        }
Пример #2
0
        /// <summary>
        /// Tokenizes a single word into n-grams and records each n-gram in <c>grams</c>:
        /// a new entry starts with 1 in its first slot, an existing entry has its
        /// <c>ABSOLUTE</c> slot incremented.
        /// </summary>
        /// <param name="workstring">The word to process</param>
        private void ProcessWord(string workstring)
        {
            string word = workstring;

            // A ProcessUnknownSymbols setting of 0 routes the word through StripUnknownSymbols first.
            if (settings.ProcessUnknownSymbols == 0)
            {
                word = StringUtil.StripUnknownSymbols(validChars, word);
            }

            // Nothing left to tokenize.
            if (word.Length == 0)
            {
                return;
            }

            // A CaseSensitivity setting of 0 means the word is upper-cased before tokenizing.
            string normalized = settings.CaseSensitivity == 0 ? word.ToUpper() : word;
            bool includeBoundaryFragments = settings.BoundaryFragments == 1;

            foreach (string gram in GramTokenizer.tokenize(normalized, settings.GrammLength, includeBoundaryFragments))
            {
                if (grams.ContainsKey(gram))
                {
                    grams[gram][ABSOLUTE]++;
                }
                else
                {
                    grams[gram] = new double[] { 1, 0, 0, 0 };
                }
            }
        }
Пример #3
0
        }//end calculateEntropy

        /// <summary>
        /// Scores a text by summing one selected corpus statistic over the n-grams found in it.
        /// The input is upper-cased before it is tokenized.
        /// </summary>
        /// <param name="input">The text to be scored</param>
        /// <param name="length">n-gram length</param>
        /// <param name="valueSelection">Index into each corpus entry's value array that selects which statistic is summed</param>
        /// <param name="weighted">
        /// If true, each distinct n-gram contributes at most once and <c>weights</c> is invoked for
        /// every n-gram that is scored; if false, every occurrence of an n-gram contributes.
        /// </param>
        /// <returns>The accumulated score; 0 if no n-gram of the input is present in the corpus</returns>
        public double calculateNGrams(string input, int length, int valueSelection, bool weighted)
        {
            double score = 0;

            // Fetch the statistics once and reuse the same result to fill the per-length
            // cache fields, instead of querying GetStatistics twice on a cold cache.
            var corpus = GetStatistics(length);

            if (corpusBigrams == null && length == 2)
            {
                corpusBigrams = corpus;
            }
            if (corpusTrigrams == null && length == 3)
            {
                corpusTrigrams = corpus;
            }

            input = input.ToUpper();
            // FIXME: case handling?

            HashSet <string> inputGrams = new HashSet <string>();

            foreach (string g in GramTokenizer.tokenize(input, length, false))
            {
                // In weighted mode HashSet.Add returns false for an n-gram already seen, so each
                // distinct n-gram is scored only once; in unweighted mode every occurrence is scored.
                if (!weighted || inputGrams.Add(g))
                {
                    if (corpus.ContainsKey(g))
                    {
                        score += corpus[g][valueSelection];
                        if (weighted)
                        {
                            // NOTE(review): any result of weights(...) is not used here;
                            // presumably called for its side effects -- confirm.
                            weights(g, length);
                        }
                    }
                }
            }

            return score;
        }
Пример #4
0
        /// <summary>
        /// Scores a text by summing one selected corpus statistic over the distinct n-grams found in it.
        /// No case normalization is applied here; the input is tokenized as given.
        /// </summary>
        /// <param name="input">The text to be scored</param>
        /// <param name="length">n-gram length</param>
        /// <param name="valueSelection">Index into each corpus entry's value array that selects which statistic is summed</param>
        /// <returns>The accumulated score; 0 if no n-gram of the input is present in the corpus</returns>
        private double calculateNGrams(string input, int length, int valueSelection)
        {
            double score = 0;
            IDictionary <string, double[]> corpusGrams = pluginFacade.GetStatistics(length);

            // FIXME: case handling?

            HashSet <string> inputGrams = new HashSet <string>();

            foreach (string g in GramTokenizer.tokenize(input, length, false))
            {
                double[] values;

                // HashSet.Add returns false for an n-gram already seen, so each distinct n-gram
                // is scored only once; TryGetValue avoids a second dictionary lookup.
                if (inputGrams.Add(g) && corpusGrams.TryGetValue(g, out values))
                {
                    score += values[valueSelection];
                }
            }

            return score;
        }