/// <summary>
/// Scores how well this table's character set covers the tokens of
/// <paramref name="otherTable"/>: the percentage of the other table's tokens
/// that are also present in this one. Rank distance is ignored — presence
/// only — which makes this a cheap first-pass filter.
/// </summary>
/// <param name="otherTable">Table whose tokens are looked up in this one.</param>
/// <param name="cutOf">Scores below this value are clamped to 0.</param>
/// <returns>Hit percentage in [0, 100], or 0 when below the cut-off.</returns>
double ITokenTable.CharsetComparisonScore(ITokenTable otherTable, double cutOf)
{
    long startTime = DateTime.Now.Ticks;

    double hits = 0;
    foreach (TokenStats test in otherTable.Values)
    {
        if (this.RankOf(test.Token) != -1)
        {
            hits++;
        }
    }

    // Guard against an empty table: the original divided by this.Count
    // unconditionally and returned NaN when the table held no tokens.
    // NOTE(review): the hit count is normalized by this.Count, not by
    // otherTable.Count — confirm that is intended for differently sized tables.
    double newScore = this.Count > 0 ? (hits / (double)this.Count) * 100 : 0;
    if (newScore < cutOf)
    {
        newScore = 0;
    }

#if DIALOGUEMASTER
    if (UseCounters)
    {
        m_Counters.Comparisons.Increment();
        m_Counters.ComparisonsPerSecond.Increment();
        m_Counters.ComparisonTime.IncrementBy((DateTime.Now.Ticks - startTime) / 100);
        m_Counters.ComparisonTimeBase.Increment();
    }
#endif
    return newScore;
}
/// <summary>
/// Rank-distance comparison between this table and <paramref name="otherTable"/>.
/// Starts from a positive budget (Count squared) and subtracts the rank
/// distance of each shared token (full <c>Ranks</c> penalty for a miss),
/// while accumulating a closeness score; the two are combined and normalized
/// against the best achievable closeness.
/// </summary>
/// <param name="otherTable">Table whose tokens are compared against this one.</param>
/// <param name="cutOf">Early-exit bound: once the budget falls below it, the
/// comparison aborts with a zero budget.</param>
/// <returns>Non-negative combined similarity score.</returns>
double ITokenTable.ComparisonScore(ITokenTable otherTable, double cutOf)
{
    long startTime = DateTime.Now.Ticks;

    // Best possible cumulative closeness: every token of the other table
    // matching at rank distance 0 earns this.Ranks points each.
    // (The original also computed an identical, never-used `maxScore` local.)
    double maxNegScore = this.Ranks * otherTable.Count;

    // Assume that at least 50% of the hits must be in the first half of the
    // table to be a scorer.
    double score = this.Count * this.Count;
    double score2 = 0;

    foreach (TokenStats test in otherTable.Values)
    {
        int otherRank = this.RankOf(test.Token);
        if (otherRank == -1)
        {
            // Miss: full penalty.
            score -= this.Ranks;
        }
        else
        {
            double val = System.Math.Abs(test.Rank - otherRank);
            score -= val;
            score2 += this.Ranks - val;
        }

        // Abort as soon as the budget drops below the caller's cut-off.
        if (score < cutOf)
        {
            score = 0;
            break;
        }
    }

    if (score < -1)
    {
        score = -1;
    }

    // Guard against an empty other table: maxNegScore == 0 made the original
    // compute 0/0 and propagate NaN through Math.Max.
    double newScore = maxNegScore > 0
        ? Math.Max(0, (score + 1) * (score2 / maxNegScore))
        : 0;

#if DIALOGUEMASTER
    if (UseCounters)
    {
        m_Counters.Comparisons.Increment();
        m_Counters.ComparisonsPerSecond.Increment();
        m_Counters.ComparisonTime.IncrementBy((DateTime.Now.Ticks - startTime) / 100);
        m_Counters.ComparisonTimeBase.Increment();
    }
#endif
    return newScore;
}
/// <summary>
/// Word-level similarity: for every word of <paramref name="otherTable"/> that
/// also occurs in this table's word list, adds a position-weighted score —
/// words near the top of our ranking score higher (Log term) and longer words
/// are weighted up (square root of the token length).
/// </summary>
/// <param name="otherTable">Table whose word list is scored against this one.</param>
/// <param name="cutOf">Scores below this value are clamped to 0.</param>
/// <param name="hits">Receives the number of words found in both tables.</param>
/// <returns>Accumulated score × 100, 0 when no word matched or below cut-off.</returns>
double ITokenTable.WordComparisonScore(ITokenTable otherTable, double cutOf, ref int hits)
{
    long startTime = DateTime.Now.Ticks;
    double score = 0;
    hits = 0;
    // (The original also kept a `hitCount` long that duplicated `hits` and was
    // never read; it has been removed.)

    foreach (TokenStats test in otherTable.WordTable.Values)
    {
        double otherPos = this.WordTable.RankOf(test.Token);
        if (otherPos != -1)
        {
            double posScore =
                Math.Log(((this.WordTable.Ranks - otherPos) / (double)this.WordTable.Ranks) + 1)
                * Math.Sqrt(test.Token.Length);
            score += posScore;
            hits++;
        }
    }

#if DIALOGUEMASTER
    if (UseCounters)
    {
        m_Counters.Comparisons.Increment();
        m_Counters.ComparisonsPerSecond.Increment();
        m_Counters.ComparisonTime.IncrementBy((DateTime.Now.Ticks - startTime) / 100);
        m_Counters.ComparisonTimeBase.Increment();
    }
#endif

    if (hits == 0)
    {
        return 0;
    }

    score = score * 100;
    if (score < cutOf)
    {
        score = 0;
    }
    return score;
}
// Renders one token table into a ListView (position | token | rank |
// occurrences [| hex]) and returns the highest rank seen.
private static int PopulateTokenView(ITokenTable source, ListView target, bool includeHex)
{
    int maxRank = 0;
    foreach (string key in source.Keys)
    {
        ITokenStats stats = source[key];
        maxRank = System.Math.Max(maxRank, stats.Rank);
        ListViewItem item = new ListViewItem(stats.Position.ToString());
        item.SubItems.Add(stats.Token);
        item.SubItems.Add(stats.Rank.ToString());
        item.SubItems.Add(stats.Occurences.ToString());
        if (includeHex)
        {
            // Hex dump makes non-ASCII n-gram tokens inspectable in the UI.
            item.SubItems.Add(FormatTokenHex(stats.Token));
        }
        target.Items.Add(item);
    }
    return maxRank;
}

// Space-separated hex representation of each character of the token;
// characters above 0xFF are emitted as two bytes, high byte first.
private static string FormatTokenHex(string token)
{
    StringBuilder sbHex = new StringBuilder();
    foreach (char c in token)
    {
        if (c > 255)
        {
            sbHex.AppendFormat("{0}{1} ",
                Convert.ToString((byte)((c & 0xff00) >> 8), 16),
                Convert.ToString((byte)c, 16));
        }
        else
        {
            sbHex.AppendFormat("{0} ", Convert.ToString((byte)c, 16));
        }
    }
    return sbHex.ToString().Trim();
}

// Shows the n-gram, word and charset tables of the selected language and a
// similarity list of all other model languages (to spot confusable pairs).
private void cbTables_SelectedIndexChanged(object sender, EventArgs e)
{
    string lang = ((CultureInfo)cbTables.SelectedItem).TwoLetterISOLanguageName;
    ITokenTable table = this.m_Model[lang] as TokenTable;

    lvNGrams.BeginUpdate();
    lvNGrams.Items.Clear();
    lvWords.BeginUpdate();
    lvWords.Items.Clear();
    listView3.BeginUpdate();
    listView3.Items.Clear();
    lvCharset.BeginUpdate();
    lvCharset.Items.Clear();

    // Fill the three token views; each helper call returns the table's
    // highest rank for the summary text boxes below.
    // (The original inlined all three loops and declared an unused `Val` local.)
    int maxRank = PopulateTokenView(table, lvNGrams, true);
    int maxWordRank = PopulateTokenView(table.WordTable, lvWords, false);
    int maxCharRank = PopulateTokenView(table.CharsetTable, lvCharset, false);

    this.tbTokenRanks.Text = maxRank.ToString();
    this.tbWordRanks.Text = maxWordRank.ToString();
    this.tbCharRanks.Text = maxCharRank.ToString();

    // Compare the selected language against every model table and list how
    // similar (and therefore how easily confused) the other languages are.
    double maxScore = 0;
    List<TableVoter> tables = new List<TableVoter>();
    foreach (string key in this.m_Model.Keys)
    {
        TableVoter tableVoter = new TableVoter(key, table.ComparisonScore(this.m_Model[key], 0));
        maxScore = Math.Max(maxScore, tableVoter.Score);
        tables.Add(tableVoter);
    }
    tables.Sort();
    foreach (TableVoter voter in tables)
    {
        if (voter.Language != lang)
        {
            voter.Score /= maxScore;
            voter.Score = Math.Max(0, 100 - (voter.Score * 150));
            // From 90 on there is not enough similarity to be wrongly detected.
            if (voter.Score > 90)
            {
                continue;
            }
            ListViewItem item = new ListViewItem(voter.Language);
            item.SubItems.Add(GetLanguageCulture(voter.Language).DisplayName.ToString());
            item.SubItems.Add(voter.Score.ToString("0.00"));
            listView3.Items.Add(item);
        }
    }

    lvNGrams.EndUpdate();
    lvWords.EndUpdate();
    listView3.EndUpdate();
    lvCharset.EndUpdate();
}
// Renders a sorted score list (highest first) into the given ListView.
// Column layout: token | rank distance (carried in Occurences) | source
// occurrences (carried in Rank) — the caller repurposes those fields.
private static void FillScoreView(List<TokenStats> scores, ListView target)
{
    scores.Sort();
    for (int i = scores.Count - 1; i > -1; i--)
    {
        TokenStats stats = scores[i];
        ListViewItem item = new ListViewItem(stats.Token);
        item.SubItems.Add(stats.Occurences.ToString());
        item.SubItems.Add(stats.Rank.ToString());
        target.Items.Add(item);
    }
}

// Analyzes the text in tbSource against the selected language model and shows
// per-token rank distances for n-grams, words and charset side by side.
private void cbLangCompare_SelectedIndexChanged(object sender, EventArgs e)
{
    this.lvAnalyzeCharsetResult.BeginUpdate();
    this.lvAnalyzeCharsetResult.Items.Clear();
    this.lvAnalyzeNGramsResult.BeginUpdate();
    this.lvAnalyzeNGramsResult.Items.Clear();
    this.lvAnalyzeWordResult.BeginUpdate();
    this.lvAnalyzeWordResult.Items.Clear();

    if (this.cbLangCompare.SelectedIndex != -1)
    {
        TokenTable testTable = new TokenTable(this.tbSource.Text);
        ITokenTable compareTable = this.m_Model[((CultureInfo)cbLangCompare.SelectedItem).TwoLetterISOLanguageName];

        // ---- n-gram comparison ------------------------------------------
        List<TokenStats> scores = new List<TokenStats>();
        int score = compareTable.Count * testTable.Count;
        foreach (ITokenStats test in testTable.Values)
        {
            TokenStats newScore = new TokenStats(test.Token);
            int otherRank = compareTable.RankOf(test.Token);
            if (otherRank == -1)
            {
                newScore.Occurences = compareTable.Count;
                score -= compareTable.Count;
            }
            else
            {
                int val = System.Math.Abs(test.Rank - otherRank);
                // Abuse the Occurences/Rank fields to carry distance and count.
                newScore.Occurences = val;
                newScore.Rank = test.Occurences;
                score -= val;
                scores.Add(newScore);
            }
        }
        tbSumTokens.Text = score.ToString();
        FillScoreView(scores, lvAnalyzeNGramsResult);

        // ---- word comparison --------------------------------------------
        // NOTE(review): unlike the n-gram section, the miss branch here does
        // not subtract from `score` — confirm whether that is intentional
        // (the displayed word score comes from WordComparisonScore anyway).
        scores = new List<TokenStats>();
        score = compareTable.WordTable.Count * testTable.WordTable.Count;
        foreach (ITokenStats test in testTable.WordTable.Values)
        {
            TokenStats newScore = new TokenStats(test.Token);
            int otherRank = compareTable.WordTable.RankOf(test.Token);
            if (otherRank == -1)
            {
                newScore.Occurences = compareTable.Ranks;
            }
            else
            {
                int val = System.Math.Abs(test.Rank - otherRank);
                newScore.Occurences = val;
                newScore.Rank = test.Occurences;
                score -= val;
                scores.Add(newScore);
            }
        }
        int hits = 0;
        double wsScore = compareTable.WordComparisonScore(testTable, 0, ref hits);
        tbSumWords.Text = wsScore.ToString("0.00") + " (" + hits + ")";
        FillScoreView(scores, lvAnalyzeWordResult);

        // ---- charset comparison -----------------------------------------
        scores = new List<TokenStats>();
        score = compareTable.CharsetTable.Count * testTable.CharsetTable.Count;
        foreach (ITokenStats test in testTable.CharsetTable.Values)
        {
            TokenStats newScore = new TokenStats(test.Token);
            int otherRank = compareTable.CharsetTable.RankOf(test.Token);
            if (otherRank == -1)
            {
                newScore.Occurences = compareTable.Ranks;
            }
            else
            {
                int val = System.Math.Abs(test.Rank - otherRank);
                newScore.Occurences = val;
                newScore.Rank = test.Occurences;
                score -= val;
                scores.Add(newScore);
            }
        }
        textBox1.Text = score.ToString();
        FillScoreView(scores, lvAnalyzeCharsetResult);
    }

    this.lvAnalyzeNGramsResult.EndUpdate();
    this.lvAnalyzeWordResult.EndUpdate();
    this.lvAnalyzeCharsetResult.EndUpdate();
}
/// <summary>
/// Classifies <paramref name="text"/> against all enabled category tables and
/// returns up to <paramref name="maxResults"/> categories with relative
/// confidence percentages. Pipeline: 1) cheap charset pre-filter,
/// 2) word-based scoring of the survivors, 3) optional n-gram disambiguation
/// when more than one candidate remains, 4) normalization of the winners.
/// The scores are relative confidences, not absolute ones.
/// </summary>
/// <param name="text">Text to classify.</param>
/// <param name="maxResults">Maximum number of categories to return.</param>
/// <returns>Sorted category list; empty when nothing matched.</returns>
public ICategoryList ClassifyText(string text, int maxResults)
{
    long startTime = DateTime.Now.Ticks;
    ListDictionary results = new ListDictionary();
    Dictionary<string, double> scores = new Dictionary<string, double>();
    TokenTable tblTest = new TokenTable(text);
    double maxScore = 0;
    double threshold = 0;
    List<TokenVoter> charsetVoters = new List<TokenVoter>();

    // Stage 1: collect stats based on charset (first, cheap filter).
    foreach (string category in this.Keys)
    {
        ITokenTable catTable = this[category];
        if (!catTable.Enabled)
        {
            continue;
        }
        double score = catTable.CharsetTable.CharsetComparisonScore(tblTest.CharsetTable, threshold);
        if (score > maxScore)
        {
            maxScore = score;
            // Threshold rises as better candidates appear, pruning later ones.
            threshold = (maxScore * this.m_Threshold);
        }
        if (score > threshold)
        {
            charsetVoters.Add(new TokenVoter(category, score));
        }
    }

    // Chinese does not have a "charset" in this sense... so to be sure, keep
    // it in the running when few candidates survived.
    if ((charsetVoters.Count < 3) && (this.Keys.Contains("zh")))
    {
        charsetVoters.Add(new TokenVoter("zh"));
    }
    charsetVoters.Sort();
    // Drop voters that were admitted before the threshold reached its final value.
    for (int i = charsetVoters.Count - 1; i > -1; i--)
    {
        if (charsetVoters[i].Score < threshold)
        {
            charsetVoters.RemoveAt(i);
        }
    }

    // Stage 2: word-based score for each surviving table.
    maxScore = 0;
    int maxWordHits = 0;
    threshold = 0;
    foreach (TokenVoter charVoter in charsetVoters)
    {
        ITokenTable catTable = this[charVoter.Category];
        if (!catTable.Enabled)
        {
            continue;
        }
        int hits = 0;
        double score = catTable.WordComparisonScore(tblTest, threshold, ref hits);
        if (hits > maxWordHits)
        {
            maxWordHits = hits;
        }
        if (score > maxScore)
        {
            maxScore = score;
            threshold = (maxScore * this.m_Threshold);
        }
        if (score > threshold)
        {
            scores.Add(charVoter.Category, score);
        }
    }

    double sumScore = 0;
    List<TokenVoter> voters = new List<TokenVoter>();
    if (scores.Count == 0 && charsetVoters.Count > 0)
    {
        // No word-level winner: fall back to the charset voters.
        // (The Count guard fixes an ArgumentOutOfRangeException in the
        // original when the charset filter had eliminated every candidate.)
        maxScore = charsetVoters[0].Score;
        foreach (TokenVoter v in charsetVoters)
        {
            scores.Add(v.Category, v.Score);
        }
    }
    threshold = (maxScore * m_Threshold);

    // Copy the scores to a sorted voters list, scaled to 0..100.
    foreach (string key in scores.Keys)
    {
        double score = scores[key];
        if (score > threshold)
        {
            score /= maxScore;
            if (maxWordHits > 0)
            {
                score /= maxWordHits;
            }
            score *= 100;
            sumScore += score;
            voters.Add(new TokenVoter(key, score));
        }
    }

    if (voters.Count > 1 && sumScore > 0)
    {
        voters.Sort();
        // Clean up and rebalance if there are more than m_MaxVoters voters:
        // subtract the first loser's score from each kept winner.
        if (voters.Count > m_MaxVoters)
        {
            sumScore = 0;
            for (int i = 0; i < m_MaxVoters; i++)
            {
                voters[i].Score -= voters[m_MaxVoters].Score;
                sumScore += voters[i].Score;
            }
            voters.RemoveRange(m_MaxVoters, voters.Count - m_MaxVoters);
        }
    }

    if (voters.Count == 1)
    {
        // Only one voter, so we are 100% sure that's the one.
        results.Add(voters[0].Category, new ScoreHolder(100));
    }
    else
    {
        for (int i = 0; i < voters.Count; i++)
        {
            TokenVoter stats = voters[i];
            double percScore = sumScore > 0 ? (stats.Score / sumScore) * 100 : 0;
            results.Add(stats.Category, new ScoreHolder(percScore));
        }

        // Stage 3: more than one possible result — try to disambiguate via
        // the full n-gram comparison (very common words/sequences).
        if ((results.Count == 0) || (results.Count > 1))
        {
            scores.Clear();
            maxScore = 0;
            threshold = 0;
            foreach (string category in results.Keys)
            {
                ITokenTable catTable = (ITokenTable)this[category];
                double score = catTable.ComparisonScore(tblTest, threshold);
                if (score > 0)
                {
                    maxScore = System.Math.Max(maxScore, score);
                    scores.Add(category, score);
                }
            }

            if (scores.Count > 0)
            {
                sumScore = 0;
                voters.Clear();
                foreach (string key in scores.Keys)
                {
                    sumScore += scores[key];
                    voters.Add(new TokenVoter(key, scores[key]));
                }
                voters.Sort();
                if (voters.Count == 1)
                {
                    // Only one n-gram voter: all other results lose a quarter
                    // of their value.
                    foreach (string category in results.Keys)
                    {
                        if (category != voters[0].Category)
                        {
                            ((ScoreHolder)results[category]).DevideScore(0.75);
                        }
                    }
                }
                else
                {
                    for (int i = 0; i < voters.Count; i++)
                    {
                        TokenVoter stats = voters[i];
                        double percScore = (stats.Score / sumScore) * 200;
                        ((ScoreHolder)results[stats.Category]).AddScore(percScore);
                    }
                    foreach (string category in results.Keys)
                    {
                        ((ScoreHolder)results[category]).DevideScore(0.75);
                    }
                }
            }
        }
    }

    // Stage 4: build the final, trimmed, re-weighted result list.
    voters.Clear();
    foreach (string key in results.Keys)
    {
        voters.Add(new TokenVoter(key, ((ScoreHolder)results[key]).Score));
    }
    voters.Sort();
    // Reduce to the maximum number of results.
    if (voters.Count > maxResults)
    {
        voters.RemoveRange(maxResults, voters.Count - maxResults);
    }

    double dSumScore = 0;
    foreach (TokenVoter voter in voters)
    {
        dSumScore += voter.Score;
    }
    results.Clear();
    foreach (TokenVoter voter in voters)
    {
        // Guard the normalization: a zero sum (all voters at score 0) made
        // the original emit NaN percentages.
        double weighted = dSumScore > 0 ? (voter.Score / dSumScore) * 100 : 0;
        results.Add(voter.Category, new ScoreHolder(weighted));
    }

    CategoryList result = new CategoryList();
    foreach (string category in results.Keys)
    {
        result.Add(new Category(category, ((ScoreHolder)results[category]).Score));
    }
    result.Sort();

#if DIALOGUEMASTER
    if (UseCounters)
    {
        m_Counters.Classifications.Increment();
        m_Counters.ClassificationsPerSecond.Increment();
        m_Counters.ComparisonTime.IncrementBy(DateTime.Now.Ticks - startTime);
        m_Counters.ComparisonTimeBase.Increment();
    }
#endif
    tblTest.Clear();
    return result;
}