/// <summary>
/// Returns the normalization divisor for <paramref name="document"/>, chosen by the current <c>normalization</c> setting.
/// </summary>
/// <param name="document">Token dictionary whose frequencies supply the divisor.</param>
/// <returns>
/// 1 when the dictionary has no entries; the value of <c>GetSquareRootOfSumSquareFrequencies()</c> when
/// <c>normalization</c> is <c>TFNormalization.squareRootOfSquareSum</c>; otherwise (including
/// <c>TFNormalization.divisionByMaxTF</c> and any other value) the value of <c>GetMaxFrequency()</c>.
/// </returns>
private Double GetDivisor(TokenDictionary document)
        {
            // Empty dictionary: return 1 - presumably a neutral divisor so callers never divide by zero (confirm against callers).
            if (document.Count == 0)
            {
                return 1;
            }

            if (normalization == TFNormalization.squareRootOfSquareSum)
            {
                return document.GetSquareRootOfSumSquareFrequencies();
            }

            // divisionByMaxTF is also the fallback for every other normalization value.
            return document.GetMaxFrequency();
        }
Esempio n. 2
0
        /// <summary>
        /// Builds a <see cref="DataTable"/> listing the tokens of <paramref name="terms"/> in ranked order,
        /// one row per token, with rank, token ID, token text and frequency columns.
        /// </summary>
        /// <param name="terms">Dictionary that supplies the ranked token/frequency pairs and the summary figures.</param>
        /// <param name="name">Title assigned to the table.</param>
        /// <param name="description">Description assigned to the table.</param>
        /// <param name="limit">Value passed to <c>GetRankedTokenFrequency</c>; also compared with <c>terms.Count</c> to decide whether a truncation note is added.</param>
        /// <returns>The populated table.</returns>
        public static DataTable MakeTable(this TokenDictionary terms, string name, string description, Int32 limit = 1000)
        {
            DataTable output = new DataTable();

            output.SetTitle(name);
            output.SetDescription(description);

            // Summary figures stored as additional info on the table.
            // NOTE(review): "Dictinct" looks like a typo for "Distinct"; kept verbatim because the entry name may be used as a lookup key - confirm before changing.
            output.SetAdditionalInfoEntry("Dictinct terms", terms.Count, "Total distinct terms in the dictionary");
            output.SetAdditionalInfoEntry("Max frequency", terms.GetMaxFrequency(), "Highest frequency");
            output.SetAdditionalInfoEntry("Total tokens", terms.GetSumFrequency(), "Total number of tokens extracted from the corpus/document, i.e. sum of all frequencies");

            // All four columns share the same importance level.
            var importance = imbSCI.Core.enums.dataPointImportance.normal;

            DataColumn rankColumn = output.Add("Rank", "Rank by frequency", "R", typeof(Int32), importance).SetWidth(20);
            DataColumn idColumn = output.Add("ID", "Token ID", "id", typeof(Int32), importance).SetWidth(20);
            DataColumn tokenColumn = output.Add("Token", "Token", "t", typeof(String), importance).SetWidth(50);
            DataColumn frequencyColumn = output.Add("Frequency", "Absolute number of token occurrences in the corpus/document", "TF", typeof(Int32), importance).SetWidth(30);

            // Rank is 1-based and follows the order in which the ranked pairs are returned.
            Int32 rank = 0;

            foreach (var entry in terms.GetRankedTokenFrequency(limit))
            {
                rank++;

                var row = output.NewRow();

                row[rankColumn] = rank;
                row[idColumn] = terms.GetTokenID(entry.Key);
                row[tokenColumn] = entry.Key;
                row[frequencyColumn] = entry.Value;

                output.Rows.Add(row);
            }

            // Tell the reader when the dictionary holds more entries than the requested limit.
            if (terms.Count > limit)
            {
                output.AddExtra("Table contains only top [" + limit + "] entries, out of [" + terms.Count + "] enumerated in the dictionary");
            }

            return output;
        }