/// <summary>
/// Builds a new token table from a file and registers it under the given category.
/// </summary>
/// <param name="category">Key under which the new table is added.</param>
/// <param name="fileName">File name handed to <c>TokenTable.CreateFromFile</c>.</param>
/// <param name="charsetFilter">Character list forwarded unchanged to <c>CreateFromFile</c>.</param>
public void AddFile(string category, string fileName, List<char> charsetFilter)
{
    // The table is always created with the default token limit.
    TokenTable fileTable = new TokenTable(MAX_TOKENS_DEFAULT);
    fileTable.CreateFromFile(fileName, charsetFilter);

    Add(category, fileTable);
}
/// <summary>
/// Creates an empty token table limited to <paramref name="maxTokens"/> tokens,
/// together with its nested word table and charset table.
/// </summary>
/// <param name="maxTokens">
/// Token limit stored in <c>m_MaxTokens</c>; it is also passed to the nested word table
/// and to the constructor of the inner dictionary.
/// </param>
public TokenTable(int maxTokens)
{
    // Store the limit first: the word table below is built from m_MaxTokens, so the
    // field must already hold the caller's value (previously it was read before being
    // assigned, so the word table was sized from the field's prior value).
    this.m_MaxTokens = maxTokens;

    this.m_WordTable = new TokenTable(this.m_MaxTokens, true);
    // NOTE(review): 50 looks like a fixed size for the charset table - confirm.
    this.m_CharsetTable = new TokenTable(50, false);

#if DIALOGUEMASTER
    this.m_InnerDict = new RemotableValuesDictionary<string, ITokenStats>(m_MaxTokens);
    // Counters are installed and created only once, the first time they are needed.
    if (UseCounters && (m_Counters == null))
    {
        Installer.InstallCounters();
        m_Counters = new TableCounters();
    }
#else
    this.m_InnerDict = new Dictionary<string, ITokenStats>(m_MaxTokens);
#endif
}
/// <summary>
/// Writes the collection's keys and values into <paramref name="info"/> as two arrays,
/// stored under the names "Keys" and "Values".
/// </summary>
/// <param name="info">Serialization store that receives the two arrays.</param>
/// <param name="context">Not used by this method.</param>
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
    int entryCount = this.Count;
    String[] categoryNames = new string[entryCount];
    TokenTable[] categoryTables = new TokenTable[entryCount];

    // Copy the keys in enumeration order.
    int nameSlot = 0;
    foreach (String name in this.Keys)
    {
        categoryNames[nameSlot++] = name;
    }

    // Copy the values in enumeration order.
    int tableSlot = 0;
    foreach (TokenTable table in this.Values)
    {
        categoryTables[tableSlot++] = table;
    }

    info.AddValue("Keys", categoryNames, typeof(String[]));
    info.AddValue("Values", categoryTables, typeof(TokenTable[]));
}
/// <summary>
/// Restores this table from the serialization data held in <c>m_siInfo</c>: the scalar
/// settings, the word table and all key/value entries, then rebuilds the charset table.
/// </summary>
/// <param name="sender">Not used by this method.</param>
/// <exception cref="SerializationException">Thrown when <c>m_siInfo</c> is null.</exception>
public new void OnDeserialization(object sender)
{
    // Without stored serialization data there is nothing to restore from.
    if (m_siInfo == null)
    {
        throw new SerializationException("Something went wrong during deserialization");
    }

    m_MaxTokens = m_siInfo.GetInt32("MaxTokens");
    m_MaxWordLen = m_siInfo.GetInt32("MaxWordLen");
    m_Ranks = m_siInfo.GetInt32("Ranks");
    m_WordTable = m_siInfo.GetValue("WordTable", typeof(TokenTable)) as TokenTable;

    // Entries were stored as two parallel arrays; re-add them pair by pair.
    String[] storedKeys = (String[])m_siInfo.GetValue("Keys", typeof(String[]));
    TokenStats[] storedStats = (TokenStats[])m_siInfo.GetValue("Values", typeof(TokenStats[]));
    int index = 0;
    while (index < storedKeys.Length)
    {
        Add(storedKeys[index], storedStats[index]);
        index++;
    }

    // rebuild CharTable
    m_CharsetTable = new TokenTable();
    BuildCharTable();
}
/// <summary>
/// Classifies <paramref name="text"/> against the enabled category tables in three
/// passes: a charset comparison that pre-selects candidates, a word comparison on
/// those candidates, and - when more than one candidate remains - a further
/// <c>ComparisonScore</c> pass used to disambiguate.
/// </summary>
/// <param name="text">Text to classify; a temporary <c>TokenTable</c> is built from it.</param>
/// <param name="maxResults">Maximum number of categories kept in the returned list.</param>
/// <returns>
/// A category list, sorted with <c>CategoryList.Sort</c>, whose scores are relative
/// weights: they are re-weighted to sum to 100 when their sum is positive and are all 0
/// otherwise. The list is empty when no category qualifies.
/// </returns>
public ICategoryList ClassifyText(string text, int maxResults)
{
    // Only consumed by the DIALOGUEMASTER performance counters at the end.
    long startTime = DateTime.Now.Ticks;

    ListDictionary results = new ListDictionary();
    Dictionary<string, double> scores = new Dictionary<string, double>();
    TokenTable tblTest = new TokenTable(text);
    double maxScore = 0;
    double threshold = 0;
    List<TokenVoter> charsetVoters = new List<TokenVoter>();

    // collect stats based on charset (first filter)
    foreach (string category in this.Keys)
    {
        ITokenTable catTable = this[category];
        if (!catTable.Enabled)
        {
            continue;
        }

        double score = catTable.CharsetTable.CharsetComparisonScore(tblTest.CharsetTable, threshold);
        if (score > maxScore)
        {
            // The threshold follows the best score seen so far.
            maxScore = score;
            threshold = (maxScore * this.m_Threshold);
        }

        if (score > threshold)
        {
            charsetVoters.Add(new TokenVoter(category, score));
        }
    }

    // chinese does not have a "Charset"... so to be sure....
    if ((charsetVoters.Count < 3) && (this.Keys.Contains("zh")))
    {
        charsetVoters.Add(new TokenVoter("zh"));
    }

    charsetVoters.Sort();

    // The threshold rose while scanning, so voters admitted early may now be below it.
    for (int i = charsetVoters.Count - 1; i > -1; i--)
    {
        if (charsetVoters[i].Score < threshold)
        {
            charsetVoters.RemoveAt(i);
        }
    }

    // collect scores for each table
    maxScore = 0;
    int maxWordHits = 0;
    threshold = 0;
    foreach (TokenVoter charVoter in charsetVoters)
    {
        ITokenTable catTable = this[charVoter.Category];
        if (!catTable.Enabled)
        {
            continue;
        }

        int hits = 0;
        double score = catTable.WordComparisonScore(tblTest, threshold, ref hits);
        if (hits > maxWordHits)
        {
            maxWordHits = hits;
        }

        if (score > maxScore)
        {
            maxScore = score;
            threshold = (maxScore * this.m_Threshold);
        }

        if (score > threshold)
        {
            scores.Add(charVoter.Category, score);
        }
    }

    double sumScore = 0;
    List<TokenVoter> voters = new List<TokenVoter>();

    // No word score qualified: fall back to the charset voters. The Count check
    // avoids indexing an empty list when no category survived the charset filter;
    // in that case the method carries on and returns an empty result.
    if ((scores.Count == 0) && (charsetVoters.Count > 0))
    {
        // take the voters from the closest charset
        // NOTE(review): assumes TokenVoter sorts best-first, so index 0 is the top score - confirm.
        maxScore = charsetVoters[0].Score;
        foreach (TokenVoter v in charsetVoters)
        {
            scores.Add(v.Category, v.Score);
        }
    }

    threshold = (maxScore * m_Threshold);

    // copy the scores to a sorted voters list
    foreach (KeyValuePair<string, double> entry in scores)
    {
        double score = entry.Value;
        if (score > threshold)
        {
            // Scale relative to the best score and, when any word hit occurred,
            // to the highest hit count.
            score /= maxScore;
            if (maxWordHits > 0)
            {
                score /= maxWordHits;
            }

            score *= 100;
            sumScore += score;
            voters.Add(new TokenVoter(entry.Key, score));
        }
    }

    if ((voters.Count > 1) && (sumScore > 0))
    {
        voters.Sort();

        // cleanup voters and rebalance if there are more than m_MaxVoters voters...
        if (voters.Count > m_MaxVoters)
        {
            sumScore = 0;
            for (int i = 0; i < m_MaxVoters; i++)
            {
                // Each kept score is reduced by the score of the first dropped voter.
                voters[i].Score -= voters[m_MaxVoters].Score;
                sumScore += voters[i].Score;
            }

            voters.RemoveRange(m_MaxVoters, voters.Count - m_MaxVoters);
        }
    }

    // now normalize results..
    // the results are not an absolute confidence
    // but relative
    if (voters.Count == 1)
    {
        // only one voter, so we are 100% sure that's the one!
        ScoreHolder newScore = new ScoreHolder(100);
        results.Add(voters[0].Category, newScore);
    }
    else
    {
        for (int i = 0; i < voters.Count; i++)
        {
            TokenVoter stats = voters[i];
            double percScore = sumScore > 0 ? (stats.Score / sumScore) * 100 : 0;
            results.Add(stats.Category, new ScoreHolder(percScore));
        }

        // if we have more than one possible result
        // we will try to disambiguate it by checking for
        // very common words
        if ((results.Count == 0) || (results.Count > 1))
        {
            scores.Clear();
            maxScore = 0;
            threshold = 0;

            // collect scores for each table
            foreach (string category in results.Keys)
            {
                ITokenTable catTable = (ITokenTable)this[category];
                double score = catTable.ComparisonScore(tblTest, threshold);
                if (score > 0)
                {
                    maxScore = System.Math.Max(maxScore, score);
                    scores.Add(category, score);
                }
            }

            // got results?
            if (scores.Count > 0)
            {
                sumScore = 0;

                // copy the scores to a sorted voters list
                voters.Clear();
                foreach (KeyValuePair<string, double> entry in scores)
                {
                    // calc sum score (strictly positive: only scores > 0 were collected)
                    sumScore += entry.Value;
                    voters.Add(new TokenVoter(entry.Key, entry.Value));
                }

                voters.Sort();

                // now normalize results..
                // the results are not an absolute confidence
                // but relative
                if (voters.Count == 1)
                {
                    // only one voter: DevideScore(0.75) is applied to every other result
                    foreach (string category in results.Keys)
                    {
                        if (category != voters[0].Category)
                        {
                            ((ScoreHolder)results[category]).DevideScore(0.75);
                        }
                    }
                }
                else
                {
                    for (int i = 0; i < voters.Count; i++)
                    {
                        TokenVoter stats = voters[i];
                        double percScore = (stats.Score / sumScore) * 200;
                        ((ScoreHolder)results[stats.Category]).AddScore(percScore);
                    }

                    foreach (string category in results.Keys)
                    {
                        ((ScoreHolder)results[category]).DevideScore(0.75);
                    }
                }
            }
        }
    }

    // now build a proper result..
    voters.Clear();
    foreach (string key in results.Keys)
    {
        voters.Add(new TokenVoter(key, ((ScoreHolder)results[key]).Score));
    }

    voters.Sort();

    // reduce to maximum results
    if (voters.Count > maxResults)
    {
        voters.RemoveRange(maxResults, voters.Count - maxResults);
    }

    // re-weight...
    double dSumScore = 0;
    foreach (TokenVoter voter in voters)
    {
        dSumScore += voter.Score;
    }

    results.Clear();
    foreach (TokenVoter voter in voters)
    {
        // Same guard as the first normalisation above: a zero sum would otherwise
        // turn every score into NaN.
        double weighted = dSumScore > 0 ? (voter.Score / dSumScore) * 100 : 0;
        results.Add(voter.Category, new ScoreHolder(weighted));
    }

    CategoryList result = new CategoryList();
    foreach (string category in results.Keys)
    {
        result.Add(new Category(category, ((ScoreHolder)results[category]).Score));
    }

    result.Sort();

#if DIALOGUEMASTER
    if (UseCounters)
    {
        m_Counters.Classifications.Increment();
        m_Counters.ClassificationsPerSecond.Increment();
        m_Counters.ComparisonTime.IncrementBy(DateTime.Now.Ticks - startTime);
        m_Counters.ComparisonTimeBase.Increment();
    }
#endif

    tblTest.Clear();
    return result;
}