///// <summary>
        ///// Normalization divisor for document
        ///// </summary>
        ///// <value>
        ///// The index of the TFN.
        ///// </value>
        //protected Dictionary<SpaceDocumentModel, Double> TFN_index { get; set; } = new Dictionary<SpaceDocumentModel, Double>();



        /// <summary>
        /// Prepares the function for the given space model: stores the short name of the selected
        /// computation and, when the function is enabled, builds the term frequency index
        /// (plus <c>SqrTc</c> when the computation is <c>TFComputation.modifiedTF</c>).
        /// </summary>
        /// <param name="_space">Space model whose terms are used to build the frequency index.</param>
        /// <param name="log">Log builder; not used by this implementation.</param>
        public override void PrepareTheModel(SpaceModel _space, ILogBuilder log)
        {
            shortName = GetFunctionName(computation);

            // A disabled function never reads the index, so return before collecting any terms.
            if (!IsEnabled)
            {
                return;
            }

            TokenDictionary training_terms = _space.GetTerms(true, true);

            if (computation == TFComputation.modifiedTF)
            {
                // SqrTc is the square root of the summed frequency of all collected terms.
                SqrTc = Math.Sqrt(training_terms.GetSumFrequency());
            }

            // Every other computation needs no extra precomputed value at this point.
            index = training_terms.ToFrequencyDictionary();
        }
// Example no. 2
// 0
        /// <summary>
        /// Builds a data table that lists the dictionary's terms with their frequencies,
        /// numbered in the order returned by <c>GetRankedTokenFrequency</c>.
        /// </summary>
        /// <param name="terms">Dictionary supplying the tokens, their IDs and their frequencies.</param>
        /// <param name="name">Title assigned to the table.</param>
        /// <param name="description">Description assigned to the table.</param>
        /// <param name="limit">Number of ranked entries requested from the dictionary.</param>
        /// <returns>
        /// Table with one row per ranked token (rank, token ID, token, frequency), summary info
        /// entries, and an extra note when the dictionary holds more terms than <paramref name="limit"/>.
        /// </returns>
        public static DataTable MakeTable(this TokenDictionary terms, string name, string description, Int32 limit = 1000)
        {
            var output = new DataTable();

            output.SetTitle(name);
            output.SetDescription(description);

            // NOTE(review): "Dictinct" is misspelled, but the label is kept as-is because it is
            // emitted at runtime and other code may look the entry up by this exact text.
            output.SetAdditionalInfoEntry("Dictinct terms", terms.Count, "Total distinct terms in the dictionary");
            output.SetAdditionalInfoEntry("Max frequency", terms.GetMaxFrequency(), "Highest frequency");
            output.SetAdditionalInfoEntry("Total tokens", terms.GetSumFrequency(), "Total number of tokens extracted from the corpus/document, i.e. sum of all frequencies");

            // All four columns share the same importance level.
            var importance = imbSCI.Core.enums.dataPointImportance.normal;

            DataColumn rankColumn      = output.Add("Rank", "Rank by frequency", "R", typeof(Int32), importance).SetWidth(20);
            DataColumn idColumn        = output.Add("ID", "Token ID", "id", typeof(Int32), importance).SetWidth(20);
            DataColumn tokenColumn     = output.Add("Token", "Token", "t", typeof(String), importance).SetWidth(50);
            DataColumn frequencyColumn = output.Add("Frequency", "Absolute number of token occurrences in the corpus/document", "TF", typeof(Int32), importance).SetWidth(30);

            // Ranks are 1-based and follow the enumeration order of the ranked list.
            Int32 rank = 0;

            foreach (var entry in terms.GetRankedTokenFrequency(limit))
            {
                rank++;

                var row = output.NewRow();

                row[rankColumn]      = rank;
                row[idColumn]        = terms.GetTokenID(entry.Key);
                row[tokenColumn]     = entry.Key;
                row[frequencyColumn] = entry.Value;

                output.Rows.Add(row);
            }

            // Tell the reader when the table was truncated to the requested limit.
            var totalTerms = terms.Count;

            if (totalTerms > limit)
            {
                output.AddExtra("Table contains only top [" + limit + "] entries, out of [" + totalTerms + "] enumerated in the dictionary");
            }

            return output;
        }