/// <summary>
/// Computes the term-frequency weighting factor of <paramref name="term"/> for the given document,
/// according to the configured <c>computation</c> mode.
/// </summary>
/// <param name="term">The (stemmed) term to weight.</param>
/// <param name="document">The document model whose term dictionary is consulted.</param>
/// <returns>
/// 1 when the factor is disabled; 0 when <c>computation</c> is <c>modifiedTF</c> and the term is
/// unknown to the training index; otherwise the computed TF factor.
/// </returns>
public override double GetElementFactor(string term, SpaceDocumentModel document)
{
    // Disabled factor contributes a neutral weight.
    if (!IsEnabled)
    {
        return 1;
    }

    TokenDictionary docDict = document.GetTerms(true, true);
    Double TF = docDict.GetTokenFrequency(term);

    // modifiedTF needs no divisor, so handle it as a guard before computing one.
    if (computation == TFComputation.modifiedTF)
    {
        // Terms absent from the training index carry no weight.
        // NOTE(review): index's value type is declared elsewhere; ContainsKey + indexer kept
        // rather than TryGetValue to avoid assuming the stored value type.
        if (!index.ContainsKey(term))
        {
            return 0;
        }

        Double Tt = index[term];            // training-corpus frequency of the term
        Double length_d = docDict.Count;    // number of distinct terms in the document

        Double mTF_above = TF * Math.Log(SqrTc / Tt);
        Double mTF_below_2nd = (length_d * length_d) / SqrTc;
        Double mTF_below = Math.Log(docDict.GetSumSquareFrequencies() * mTF_below_2nd);

        return mTF_above / mTF_below;
    }

    Double divisor = GetDivisor(docDict);

    // Remaining modes all normalize TF by the document divisor.
    switch (computation)
    {
        case TFComputation.squareRooted:
            return Math.Sqrt(TF / divisor);

        case TFComputation.glasgow:
            return Math.Log(TF + 1) / divisor;

        case TFComputation.normal:
        default:
            return TF / divisor;
    }
}
/// <summary>
/// Constructs a document model: tokenizes the text, builds a stemmed term dictionary,
/// updates corpus-level label statistics and (optionally) a per-position word index.
/// </summary>
/// <param name="text">The text.</param>
/// <param name="name">The name.</param>
/// <param name="context">The context (space model) whose label/term dictionaries are updated.</param>
/// <param name="stemmContext">The stemm context.</param>
/// <param name="tokenizer">The tokenizer.</param>
/// <param name="isKnownDocument">If true, stems are counted into <c>terms_known_label</c>, otherwise into <c>terms_unknown_label</c>.</param>
/// <param name="metrics">The metrics accumulator; may be null, in which case no metrics are recorded.</param>
/// <returns>The populated <see cref="SpaceDocumentModel"/>.</returns>
public SpaceDocumentModel ConstructDocument(string text, String name, SpaceModel context, StemmingContext stemmContext, ITokenizer tokenizer, Boolean isKnownDocument, ContentMetrics metrics = null)
{
    var tokens = tokenizer.Tokenize(text);

    if (metrics != null)
    {
        metrics.TokensDoc += tokens.Length;         // raw token count
    }

    TokenDictionary tokenDictionary = new TokenDictionary(tokens);

    if (metrics != null)
    {
        metrics.UniqueTokensDoc += tokenDictionary.Count;   // distinct surface forms
    }

    // Build the stemmed dictionary from the unique tokens, caching each stem so the
    // per-occurrence pass below does not re-stem the same surface form repeatedly.
    TokenDictionary stemmDictionary = new TokenDictionary();
    List<String> tkn = tokenDictionary.GetTokens();
    Dictionary<String, String> stemCache = new Dictionary<String, String>();

    for (int i2 = 0; i2 < tkn.Count; i2++)
    {
        String stk = stemmContext.Stem(tkn[i2]);
        stemCache[tkn[i2]] = stk;
        stemmDictionary.CountToken(stk, tokenDictionary.GetTokenFrequency(tkn[i2]));
    }

    if (metrics != null)
    {
        metrics.StemmedTokensDoc += stemmDictionary.Count;  // distinct stems
    }

    SpaceDocumentModel document = new SpaceDocumentModel();
    document.name = name;
    document.terms = stemmDictionary;
    document.Length = tokens.Length;

    if (spaceSettings.DoMaintainWordIndex)
    {
        document.Words = new int[document.Length];
    }

    for (int i = 0; i < tokens.Length; i++)
    {
        // Every token was counted into tokenDictionary above, so the cache should always
        // hit; the fallback stems defensively in case the dictionary normalized the token.
        String stk;
        if (!stemCache.TryGetValue(tokens[i], out stk))
        {
            stk = stemmContext.Stem(tokens[i]);
        }

        if (isKnownDocument)
        {
            context.terms_known_label.AddToken(stk);
        }
        else
        {
            context.terms_unknown_label.AddToken(stk);
        }

        if (spaceSettings.DoMaintainWordIndex)
        {
            document.Words[i] = context.terms.GetTokenID(stk);
        }
    }

    return document;
}