예제 #1
0
        /// <summary>
        /// Builds a <see cref="DataTable"/> that lists the tokens of <paramref name="terms"/> together with their frequencies.
        /// </summary>
        /// <remarks>
        /// The table receives the given title and description, three additional-info entries
        /// (number of distinct terms, maximum frequency, sum of all frequencies) and one row per entry
        /// returned by <c>terms.GetRankedTokenFrequency(limit)</c>, in the order that call yields them.
        /// Each row holds a 1-based rank, the token ID, the token itself and its frequency.
        /// When fewer rows were written than the dictionary holds, an extra line noting the truncation is added.
        /// </remarks>
        /// <param name="terms">Dictionary whose tokens and frequencies are tabulated.</param>
        /// <param name="name">Title assigned to the table.</param>
        /// <param name="description">Description assigned to the table.</param>
        /// <param name="limit">Number of ranked entries requested from the dictionary.</param>
        /// <returns>The populated table.</returns>
        public static DataTable MakeTable(this TokenDictionary terms, string name, string description, Int32 limit = 1000)
        {
            DataTable table = new DataTable();

            table.SetTitle(name);
            table.SetDescription(description);

            // NOTE(review): "Dictinct" looks like a typo for "Distinct"; left unchanged because the
            // label may be used as a lookup key elsewhere - confirm before correcting.
            table.SetAdditionalInfoEntry("Dictinct terms", terms.Count, "Total distinct terms in the dictionary");
            table.SetAdditionalInfoEntry("Max frequency", terms.GetMaxFrequency(), "Highest frequency");
            table.SetAdditionalInfoEntry("Total tokens", terms.GetSumFrequency(), "Total number of tokens extracted from the corpus/document, i.e. sum of all frequencies");

            DataColumn column_rank  = table.Add("Rank", "Rank by frequency", "R", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(20);
            DataColumn column_id    = table.Add("ID", "Token ID", "id", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(20);
            DataColumn column_token = table.Add("Token", "Token", "t", typeof(String), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(50);
            DataColumn column_freq  = table.Add("Frequency", "Absolute number of token occurrences in the corpus/document", "TF", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(30);

            // Doubles as the 1-based rank of the current row and, after the loop, the number of rows written.
            Int32 rank = 0;

            foreach (var pair in terms.GetRankedTokenFrequency(limit))
            {
                rank++;

                var dr = table.NewRow();

                dr[column_rank]  = rank;
                dr[column_id]    = terms.GetTokenID(pair.Key);
                dr[column_token] = pair.Key;
                dr[column_freq]  = pair.Value;

                table.Rows.Add(dr);
            }

            // Base the truncation note on what was actually written rather than on the requested limit,
            // so a non-positive limit never produces a misleading "top [0]" / "top [-1]" message.
            var totalTerms = terms.Count;

            if (rank < totalTerms)
            {
                table.AddExtra("Table contains only top [" + rank + "] entries, out of [" + totalTerms + "] enumerated in the dictionary");
            }

            return table;
        }
예제 #2
0
        /// <summary>
        /// Renders the ranked token frequencies of <paramref name="terms"/> as tab-separated plain text.
        /// </summary>
        /// <remarks>
        /// The text starts with the name, the description and the distinct term count, followed by a
        /// "Showing top" line when <paramref name="limit"/> is positive, a column header and one line
        /// per entry returned by <c>terms.GetRankedTokenFrequency(limit)</c>.
        /// </remarks>
        /// <param name="terms">Dictionary whose tokens and frequencies are listed.</param>
        /// <param name="name">Name written on the first line.</param>
        /// <param name="description">Description written on the second line.</param>
        /// <param name="limit">Number of entries requested from the dictionary; when positive, the listing also stops after this many lines.</param>
        /// <param name="filepath">When not null or empty, the generated text is also written to this file.</param>
        /// <returns>The generated text.</returns>
        public static String MakeRankedList(this TokenDictionary terms, string name, string description, Int32 limit = 1000, String filepath = "")
        {
            var rankedEntries = terms.GetRankedTokenFrequency(limit);
            Boolean limited   = limit > 0;

            StringBuilder output = new StringBuilder();

            output.AppendLine("Name: " + name);
            output.AppendLine("Description: " + description);
            output.AppendLine("Distinct terms: " + terms.Count);

            if (limited)
            {
                output.AppendLine("Showing top: " + limit);
            }

            // NOTE(review): the header names five columns while each row below writes four values
            // (rank, ID, token, frequency) - confirm which heading is surplus before changing it.
            output.AppendLine("# \t\t ID \t\t KEY \t\t TKN \t\t\t FREQ");

            Int32 rank = 0;

            foreach (var entry in rankedEntries)
            {
                rank++;

                String line = rank.ToString() + "\t\t" + terms.GetTokenID(entry.Key) + "\t\t" + entry.Key + "\t\t\t" + entry.Value;
                output.AppendLine(line);

                // Stop once the requested number of lines has been written.
                if (limited && rank >= limit)
                {
                    break;
                }
            }

            String text = output.ToString();

            if (filepath.isNullOrEmpty())
            {
                return text;
            }

            File.WriteAllText(filepath, text);

            return text;
        }