/// <summary>
        /// Computes the term-frequency factor of <paramref name="term"/> within <paramref name="document"/>,
        /// using the formula selected by <c>computation</c>.
        /// </summary>
        /// <remarks>
        /// With TF being the frequency of the term in the document's term dictionary:
        /// <list type="bullet">
        /// <item><description><c>modifiedTF</c>: <c>TF * ln(SqrTc / Tt) / ln(sumSquareFrequencies * length^2 / SqrTc)</c>,
        /// where <c>Tt</c> is the value stored in <c>index</c> for the term and <c>length</c> is the
        /// entry count of the document's term dictionary.</description></item>
        /// <item><description><c>squareRooted</c>: <c>sqrt(TF / divisor)</c>.</description></item>
        /// <item><description><c>glasgow</c>: <c>ln(TF + 1) / divisor</c>.</description></item>
        /// <item><description><c>normal</c> and any other value: <c>TF / divisor</c>.</description></item>
        /// </list>
        /// The divisor is whatever <c>GetDivisor</c> returns for the document's term dictionary.
        /// </remarks>
        /// <param name="term">The term whose factor is requested.</param>
        /// <param name="document">The document model supplying the term dictionary.</param>
        /// <returns>
        /// <c>1</c> when the factor is disabled; <c>0</c> in <c>modifiedTF</c> mode when the term is not
        /// present in <c>index</c>; otherwise the value of the selected formula.
        /// </returns>
        public override double GetElementFactor(string term, SpaceDocumentModel document)
        {
            // A disabled factor contributes the constant 1.
            if (!IsEnabled)
            {
                return 1;
            }

            TokenDictionary documentTerms = document.GetTerms(true, true);
            double termFrequency = documentTerms.GetTokenFrequency(term);

            if (computation == TFComputation.modifiedTF)
            {
                // A term that is missing from the index yields a zero factor.
                if (!index.ContainsKey(term))
                {
                    return 0;
                }

                double indexedFrequency = index[term];
                double documentLength = documentTerms.Count;

                double numerator = termFrequency * Math.Log(SqrTc / indexedFrequency);

                double squaredLengthRatio = (documentLength * documentLength) / SqrTc;
                double denominator = Math.Log(documentTerms.GetSumSquareFrequencies() * squaredLengthRatio);

                return numerator / denominator;
            }

            // Every remaining mode normalizes the raw frequency by the same divisor.
            double normalizer = GetDivisor(documentTerms);

            switch (computation)
            {
                case TFComputation.squareRooted:
                    return Math.Sqrt(termFrequency / normalizer);

                case TFComputation.glasgow:
                    return Math.Log(termFrequency + 1) / normalizer;

                default:
                    // Covers TFComputation.normal as well as any unlisted mode.
                    return termFrequency / normalizer;
            }
        }
Exemple #2
0
        /// <summary>
        /// Constructs a document model from raw text: the text is tokenized, every token is stemmed,
        /// and the stemmed term frequencies become the document's term dictionary.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <param name="name">The name assigned to the resulting document model.</param>
        /// <param name="context">
        /// The space model whose known/unknown label dictionaries receive each stemmed token, and whose
        /// term dictionary supplies token IDs when the word index is maintained.
        /// </param>
        /// <param name="stemmContext">The stemming context used to stem each token.</param>
        /// <param name="tokenizer">The tokenizer used to split <paramref name="text"/> into tokens.</param>
        /// <param name="isKnownDocument">
        /// When <c>true</c>, stemmed tokens are added to <c>context.terms_known_label</c>; otherwise they
        /// are added to <c>context.terms_unknown_label</c>.
        /// </param>
        /// <param name="metrics">
        /// Optional metrics; when not <c>null</c>, its total, unique and stemmed token counters are
        /// increased by this document's counts.
        /// </param>
        /// <returns>The constructed document model. This method does not add it to <paramref name="context"/>.</returns>
        public SpaceDocumentModel ConstructDocument(string text, String name, SpaceModel context, StemmingContext stemmContext, ITokenizer tokenizer, Boolean isKnownDocument, ContentMetrics metrics = null)
        {
            var tokens = tokenizer.Tokenize(text);
            TokenDictionary tokenDictionary = new TokenDictionary(tokens);

            // Stem each distinct token once, carrying its frequency over to the stem so that
            // tokens sharing a stem accumulate into a single entry.
            TokenDictionary stemmDictionary = new TokenDictionary();
            foreach (String token in tokenDictionary.GetTokens())
            {
                stemmDictionary.CountToken(stemmContext.Stem(token), tokenDictionary.GetTokenFrequency(token));
            }

            // Metrics are updated in one place, after both dictionaries are built, so a failure
            // above never leaves them partially updated.
            if (metrics != null)
            {
                metrics.TokensDoc += tokens.Length;                 // all tokens
                metrics.UniqueTokensDoc += tokenDictionary.Count;   // distinct tokens
                metrics.StemmedTokensDoc += stemmDictionary.Count;  // distinct stems
            }

            SpaceDocumentModel document = new SpaceDocumentModel
            {
                name = name,
                terms = stemmDictionary,
                Length = tokens.Length
            };

            // Read the setting once so the allocation of Words and the writes into it always agree.
            bool maintainWordIndex = spaceSettings.DoMaintainWordIndex;
            if (maintainWordIndex)
            {
                document.Words = new int[document.Length];
            }

            for (int i = 0; i < tokens.Length; i++)
            {
                String stem = stemmContext.Stem(tokens[i]);

                if (isKnownDocument)
                {
                    context.terms_known_label.AddToken(stem);
                }
                else
                {
                    context.terms_unknown_label.AddToken(stem);
                }

                if (maintainWordIndex)
                {
                    // NOTE(review): this method does not add the stems to context.terms itself;
                    // GetTokenID presumably relies on them being registered elsewhere - confirm.
                    document.Words[i] = context.terms.GetTokenID(stem);
                }
            }

            return document;
        }