示例#1
0
        /// <summary>
        /// Scores how many tokens of <paramref name="otherTable"/> are also known to this table.
        /// </summary>
        /// <param name="otherTable">Table whose tokens are looked up in this table via <c>RankOf</c>.</param>
        /// <param name="cutOf">Minimum score; a result below this value is reported as 0.</param>
        /// <returns>
        /// The number of tokens of <paramref name="otherTable"/> for which <c>RankOf</c> does not return -1,
        /// divided by this table's <c>Count</c> and multiplied by 100. Returns 0 when that value is below
        /// <paramref name="cutOf"/> or when this table is empty.
        /// </returns>
        double ITokenTable.CharsetComparisonScore(ITokenTable otherTable, double cutOf)
        {
#if DIALOGUEMASTER
            // Only the performance counters below need the start time.
            long startTime = DateTime.Now.Ticks;
#endif

            int hits = 0;
            foreach (TokenStats test in otherTable.Values)
            {
                if (this.RankOf(test.Token) != -1)
                {
                    hits++;
                }
            }

            // An empty table cannot match anything; avoid the 0/0 division that would yield NaN.
            double ownCount = this.Count;
            double newScore = ownCount > 0 ? (hits / ownCount) * 100 : 0;
            if (newScore < cutOf)
            {
                newScore = 0;
            }

#if DIALOGUEMASTER
            if (UseCounters)
            {
                m_Counters.Comparisons.Increment();
                m_Counters.ComparisonsPerSecond.Increment();
                m_Counters.ComparisonTime.IncrementBy((DateTime.Now.Ticks - startTime) / 100);
                m_Counters.ComparisonTimeBase.Increment();
            }
#endif
            return newScore;
        }
示例#2
0
        /// <summary>
        /// Compares the token ranks of <paramref name="otherTable"/> with the ranks in this table.
        /// </summary>
        /// <remarks>
        /// A running score starts at <c>Count * Count</c>. For every token of <paramref name="otherTable"/>:
        /// if this table does not rank it (<c>RankOf</c> returns -1) <c>Ranks</c> is subtracted; otherwise the
        /// absolute rank difference is subtracted and <c>Ranks - difference</c> is added to a second accumulator.
        /// The result is the running score (plus one) weighted by the second accumulator relative to
        /// <c>Ranks * otherTable.Count</c>, and is never negative.
        /// </remarks>
        /// <param name="otherTable">Table whose tokens and ranks are compared against this table.</param>
        /// <param name="cutOf">When the running score drops below this value it is set to 0 and the scan stops.</param>
        /// <returns>The non-negative comparison score; 0 when the weighting denominator is 0.</returns>
        double ITokenTable.ComparisonScore(ITokenTable otherTable, double cutOf)
        {
#if DIALOGUEMASTER
            // Only the performance counters below need the start time.
            long startTime = DateTime.Now.Ticks;
#endif

            // Multiply as double so that large tables cannot overflow the integer product.
            double maxNegScore = (double)this.Ranks * otherTable.Count;
            double score = (double)this.Count * this.Count;
            double score2 = 0;

            foreach (TokenStats test in otherTable.Values)
            {
                int otherRank = this.RankOf(test.Token);
                if (otherRank == -1)
                {
                    score -= this.Ranks;
                }
                else
                {
                    double val = Math.Abs(test.Rank - otherRank);

                    score -= val;
                    score2 += this.Ranks - val;
                }

                if (score < cutOf)
                {
                    score = 0;
                    break;
                }
            }

            if (score < -1)
            {
                score = -1;
            }

            // Without a denominator there is nothing to weight by; report 0 instead of NaN/Infinity.
            double newScore = 0;
            if (maxNegScore > 0)
            {
                newScore = Math.Max(0, (score + 1) * (score2 / maxNegScore));
            }

#if DIALOGUEMASTER
            if (UseCounters)
            {
                m_Counters.Comparisons.Increment();
                m_Counters.ComparisonsPerSecond.Increment();
                m_Counters.ComparisonTime.IncrementBy((DateTime.Now.Ticks - startTime) / 100);
                m_Counters.ComparisonTimeBase.Increment();
            }
#endif
            return newScore;
        }
示例#3
0
        /// <summary>
        /// Scores the words of <paramref name="otherTable"/> against this table's word table.
        /// </summary>
        /// <remarks>
        /// Every word of <c>otherTable.WordTable</c> that this table's <c>WordTable</c> ranks contributes
        /// <c>log((Ranks - rank) / Ranks + 1) * sqrt(word length)</c>; the sum is multiplied by 100.
        /// </remarks>
        /// <param name="otherTable">Table whose word table supplies the words to look up.</param>
        /// <param name="cutOf">Minimum score; a result below this value is reported as 0.</param>
        /// <param name="hits">Reset to 0 on entry; receives the number of words that were found.</param>
        /// <returns>The word score, or 0 when no word was found or the score is below <paramref name="cutOf"/>.</returns>
        double ITokenTable.WordComparisonScore(ITokenTable otherTable, double cutOf, ref int hits)
        {
#if DIALOGUEMASTER
            // Only the performance counters below need the start time.
            long startTime = DateTime.Now.Ticks;
#endif

            double score = 0;
            hits = 0;
            foreach (TokenStats test in otherTable.WordTable.Values)
            {
                int otherPos = this.WordTable.RankOf(test.Token);
                if (otherPos == -1)
                {
                    continue;
                }

                double rankWeight = (double)(this.WordTable.Ranks - otherPos) / (double)this.WordTable.Ranks;
                score += Math.Log(rankWeight + 1) * Math.Sqrt(test.Token.Length);
                hits++;
            }

#if DIALOGUEMASTER
            if (UseCounters)
            {
                m_Counters.Comparisons.Increment();
                m_Counters.ComparisonsPerSecond.Increment();
                m_Counters.ComparisonTime.IncrementBy((DateTime.Now.Ticks - startTime) / 100);
                m_Counters.ComparisonTimeBase.Increment();
            }
#endif
            if (hits == 0)
            {
                return 0;
            }

            score = score * 100;
            if (score < cutOf)
            {
                score = 0;
            }
            return score;
        }
示例#4
0
        /// <summary>
        /// Refreshes the table views for the language selected in <c>cbTables</c>.
        /// </summary>
        /// <remarks>
        /// Fills the n-gram, word and charset list views from the selected language's table, shows the
        /// highest rank of each table, and lists the other languages of the model whose converted
        /// comparison score is 90 or less. All list views are cleared first; when nothing is selected
        /// or the model has no table for the language they simply stay empty.
        /// </remarks>
        private void cbTables_SelectedIndexChanged(object sender, EventArgs e)
        {
            lvNGrams.BeginUpdate();
            lvWords.BeginUpdate();
            listView3.BeginUpdate();
            lvCharset.BeginUpdate();
            try
            {
                lvNGrams.Items.Clear();
                lvWords.Items.Clear();
                listView3.Items.Clear();
                lvCharset.Items.Clear();

                CultureInfo culture = cbTables.SelectedItem as CultureInfo;
                if (culture == null)
                {
                    return;
                }

                string lang = culture.TwoLetterISOLanguageName;
                ITokenTable table = this.m_Model[lang];
                if (table == null)
                {
                    return;
                }

                int maxRank = 0;
                foreach (string key in table.Keys)
                {
                    ITokenStats stats = table[key];
                    maxRank = Math.Max(maxRank, stats.Rank);
                    ListViewItem item = CreateTokenStatsItem(stats);
                    item.SubItems.Add(FormatTokenAsHex(stats.Token));
                    lvNGrams.Items.Add(item);
                }

                int maxWordRank = 0;
                ITokenTable wordTable = table.WordTable;
                foreach (string key in wordTable.Keys)
                {
                    ITokenStats stats = wordTable[key];
                    maxWordRank = Math.Max(maxWordRank, stats.Rank);
                    lvWords.Items.Add(CreateTokenStatsItem(stats));
                }

                int maxCharRank = 0;
                ITokenTable charsetTable = table.CharsetTable;
                foreach (string key in charsetTable.Keys)
                {
                    ITokenStats stats = charsetTable[key];
                    maxCharRank = Math.Max(maxCharRank, stats.Rank);
                    lvCharset.Items.Add(CreateTokenStatsItem(stats));
                }

                this.tbTokenRanks.Text = maxRank.ToString();
                this.tbWordRanks.Text = maxWordRank.ToString();
                this.tbCharRanks.Text = maxCharRank.ToString();

                // Score every table of the model against the selected one.
                double maxScore = 0;
                List<TableVoter> tables = new List<TableVoter>();
                foreach (string key in this.m_Model.Keys)
                {
                    TableVoter tableVoter = new TableVoter(key, table.ComparisonScore(this.m_Model[key], 0));
                    maxScore = Math.Max(maxScore, tableVoter.Score);
                    tables.Add(tableVoter);
                }
                tables.Sort();

                foreach (TableVoter voter in tables)
                {
                    if (voter.Language == lang)
                    {
                        continue;
                    }

                    // Normalize against the best score; with no positive score at all there is
                    // nothing to normalize by, so treat the table as not similar instead of NaN.
                    double relative = maxScore > 0 ? voter.Score / maxScore : 0;
                    voter.Score = Math.Max(0, 100 - (relative * 150));

                    // from 90 on there is not enough similarity to be wrongly detected
                    if (voter.Score > 90)
                    {
                        continue;
                    }

                    ListViewItem item = new ListViewItem(voter.Language);
                    item.SubItems.Add(GetLanguageCulture(voter.Language).DisplayName.ToString());
                    item.SubItems.Add(voter.Score.ToString("0.00"));
                    listView3.Items.Add(item);
                }
            }
            finally
            {
                // Always re-enable painting, even if filling the views failed.
                lvNGrams.EndUpdate();
                lvWords.EndUpdate();
                listView3.EndUpdate();
                lvCharset.EndUpdate();
            }
        }

        /// <summary>
        /// Builds a list row with the position, token, rank and occurrence count of <paramref name="stats"/>.
        /// </summary>
        private static ListViewItem CreateTokenStatsItem(ITokenStats stats)
        {
            ListViewItem item = new ListViewItem(stats.Position.ToString());
            item.SubItems.Add(stats.Token);
            item.SubItems.Add(stats.Rank.ToString());
            item.SubItems.Add(stats.Occurences.ToString());
            return item;
        }

        /// <summary>
        /// Formats each character of <paramref name="token"/> as zero-padded hex (two digits up to
        /// 0xFF, four digits above), separated by single spaces.
        /// </summary>
        private static string FormatTokenAsHex(string token)
        {
            StringBuilder sbHex = new StringBuilder();
            foreach (char c in token)
            {
                if (sbHex.Length > 0)
                {
                    sbHex.Append(' ');
                }
                // Zero padding keeps the two bytes of a wide character distinguishable (0x0105 -> "0105").
                sbHex.Append(((int)c).ToString(c > 255 ? "x4" : "x2"));
            }
            return sbHex.ToString();
        }
示例#5
0
        /// <summary>
        /// Compares the text in <c>tbSource</c> with the table of the language selected in
        /// <c>cbLangCompare</c> and shows the per-token rank distances.
        /// </summary>
        /// <remarks>
        /// The three result list views are always cleared. With a selection, a table is built from the
        /// source text and its n-gram, word and charset tokens are compared with the selected language's
        /// tables. Only n-gram tokens missing from the language table are penalized (by the language
        /// table's <c>Count</c>); missing words and charset tokens leave the score unchanged.
        /// </remarks>
        private void cbLangCompare_SelectedIndexChanged(object sender, EventArgs e)
        {
            this.lvAnalyzeCharsetResult.BeginUpdate();
            this.lvAnalyzeNGramsResult.BeginUpdate();
            this.lvAnalyzeWordResult.BeginUpdate();
            try
            {
                this.lvAnalyzeCharsetResult.Items.Clear();
                this.lvAnalyzeNGramsResult.Items.Clear();
                this.lvAnalyzeWordResult.Items.Clear();

                if (this.cbLangCompare.SelectedIndex == -1)
                {
                    return;
                }

                TokenTable testTable = new TokenTable(this.tbSource.Text);
                ITokenTable compareTable = this.m_Model[((CultureInfo)cbLangCompare.SelectedItem).TwoLetterISOLanguageName];
                if (compareTable == null)
                {
                    return;
                }

                int score;

                // N-grams: unknown tokens cost the full size of the language table.
                List<TokenStats> scores = CollectRankDistances(testTable, compareTable, compareTable.Count, out score);
                tbSumTokens.Text = score.ToString();
                lvAnalyzeNGramsResult.Items.AddRange(CreateRankDistanceItems(scores));

                // Words: the summary shows the word comparison score, the list shows the rank distances.
                scores = CollectRankDistances(testTable.WordTable, compareTable.WordTable, 0, out score);
                int hits = 0;
                double wsScore = compareTable.WordComparisonScore(testTable, 0, ref hits);
                tbSumWords.Text = wsScore.ToString("0.00") + " (" + hits + ")";
                lvAnalyzeWordResult.Items.AddRange(CreateRankDistanceItems(scores));

                // Charset.
                scores = CollectRankDistances(testTable.CharsetTable, compareTable.CharsetTable, 0, out score);
                textBox1.Text = score.ToString();
                lvAnalyzeCharsetResult.Items.AddRange(CreateRankDistanceItems(scores));
            }
            finally
            {
                // Always re-enable painting, even if the comparison failed.
                this.lvAnalyzeNGramsResult.EndUpdate();
                this.lvAnalyzeWordResult.EndUpdate();
                this.lvAnalyzeCharsetResult.EndUpdate();
            }
        }

        /// <summary>
        /// Collects, for every token of <paramref name="testTable"/> that <paramref name="referenceTable"/>
        /// ranks, an entry holding the absolute rank difference and the token's occurrence count.
        /// </summary>
        /// <param name="testTable">Table whose tokens are looked up.</param>
        /// <param name="referenceTable">Table the tokens are looked up in.</param>
        /// <param name="missPenalty">Amount subtracted from the score for each token the reference table does not rank.</param>
        /// <param name="score">
        /// Starts at the product of both tables' <c>Count</c>, reduced by the rank difference of every
        /// found token and by <paramref name="missPenalty"/> for every missing one.
        /// </param>
        /// <returns>One entry per found token; tokens the reference table does not rank are omitted.</returns>
        private static List<TokenStats> CollectRankDistances(ITokenTable testTable, ITokenTable referenceTable, int missPenalty, out int score)
        {
            List<TokenStats> distances = new List<TokenStats>();
            score = referenceTable.Count * testTable.Count;

            foreach (ITokenStats test in testTable.Values)
            {
                int otherRank = referenceTable.RankOf(test.Token);
                if (otherRank == -1)
                {
                    score -= missPenalty;
                    continue;
                }

                int val = Math.Abs(test.Rank - otherRank);
                TokenStats entry = new TokenStats(test.Token);
                entry.Occurences = val;
                // The Rank field is reused to carry the occurrence count of the token in the test text.
                entry.Rank = test.Occurences;
                score -= val;
                distances.Add(entry);
            }
            return distances;
        }

        /// <summary>
        /// Sorts <paramref name="distances"/> and builds one list row per entry, in reverse sort order,
        /// with the token, its <c>Occurences</c> value and its <c>Rank</c> value.
        /// </summary>
        private static ListViewItem[] CreateRankDistanceItems(List<TokenStats> distances)
        {
            distances.Sort();

            ListViewItem[] items = new ListViewItem[distances.Count];
            for (int i = 0; i < items.Length; i++)
            {
                TokenStats stats = distances[distances.Count - 1 - i];
                ListViewItem item = new ListViewItem(stats.Token);
                item.SubItems.Add(stats.Occurences.ToString());
                item.SubItems.Add(stats.Rank.ToString());
                items[i] = item;
            }
            return items;
        }
示例#6
0
        /// <summary>
        /// Classifies <paramref name="text"/> against the enabled category tables of this model.
        /// </summary>
        /// <remarks>
        /// Works in stages: (1) a charset comparison selects candidate categories, (2) a word comparison
        /// scores the candidates (falling back to the charset scores when no word score survives),
        /// (3) when there is not exactly one voter left, a token rank comparison is mixed into the
        /// scores, and (4) the list is cut to <paramref name="maxResults"/> entries and re-weighted so
        /// the returned scores are relative to each other rather than absolute confidences.
        /// </remarks>
        /// <param name="text">Text to classify; a token table is built from it.</param>
        /// <param name="maxResults">Maximum number of categories to return.</param>
        /// <returns>The sorted list of matching categories; empty when no category qualifies.</returns>
        public ICategoryList ClassifyText(string text, int maxResults)
        {
#if DIALOGUEMASTER
            // Only the performance counters below need the start time.
            long startTime = DateTime.Now.Ticks;
#endif
            ListDictionary results = new ListDictionary();
            Dictionary<string, double> scores = new Dictionary<string, double>();
            TokenTable tblTest = new TokenTable(text);
            double maxScore = 0;
            double threshold = 0;

            List<TokenVoter> charsetVoters = new List<TokenVoter>();
            bool chineseAlreadyVoting = false;

            // collect stats based on charset (first filter)
            foreach (string category in this.Keys)
            {
                ITokenTable catTable = this[category];
                if (!catTable.Enabled)
                {
                    continue;
                }
                double score = catTable.CharsetTable.CharsetComparisonScore(tblTest.CharsetTable, threshold);

                if (score > maxScore)
                {
                    maxScore = score;
                    threshold = (maxScore * this.m_Threshold);
                }
                if (score > threshold)
                {
                    charsetVoters.Add(new TokenVoter(category, score));
                    if (category == "zh")
                    {
                        chineseAlreadyVoting = true;
                    }
                }
            }

            // chinese does not have a "Charset"... so to be sure....
            // Skip the fallback when "zh" already votes: a second "zh" voter would later be added
            // to the score dictionary twice and fail with a duplicate key.
            if ((charsetVoters.Count < 3) && !chineseAlreadyVoting && (this.Keys.Contains("zh")))
            {
                charsetVoters.Add(new TokenVoter("zh"));
            }

            charsetVoters.Sort();
            // The threshold grew while scanning, so drop voters that no longer reach it.
            for (int i = charsetVoters.Count - 1; i > -1; i--)
            {
                if (charsetVoters[i].Score < threshold)
                {
                    charsetVoters.RemoveAt(i);
                }
            }

            maxScore = 0;
            // collect scores for each table
            int maxWordHits = 0;

            threshold = 0;
            foreach (TokenVoter charVoter in charsetVoters)
            {
                ITokenTable catTable = this[charVoter.Category];
                if (!catTable.Enabled)
                {
                    continue;
                }
                int hits = 0;
                double score = catTable.WordComparisonScore(tblTest, threshold, ref hits);
                if (hits > maxWordHits)
                {
                    maxWordHits = hits;
                }

                if (score > maxScore)
                {
                    maxScore = score;
                    threshold = (maxScore * this.m_Threshold);
                }
                if (score > threshold)
                {
                    scores.Add(charVoter.Category, score);
                }
            }

            double sumScore = 0;
            List<TokenVoter> voters = new List<TokenVoter>();

            // No word score survived: take the voters from the closest charset instead.
            // With no charset voter either there is nothing to fall back to and the result stays empty.
            if ((scores.Count == 0) && (charsetVoters.Count > 0))
            {
                maxScore = charsetVoters[0].Score;
                foreach (TokenVoter v in charsetVoters)
                {
                    scores.Add(v.Category, v.Score);
                }
            }
            threshold = (maxScore * m_Threshold);

            // copy the scores to a sorted voters list
            foreach (string key in scores.Keys)
            {
                double score = scores[key];

                if (score > threshold)
                {
                    score /= maxScore;
                    if (maxWordHits > 0)
                    {
                        score /= maxWordHits;
                    }
                    score *= 100;
                    sumScore += score;
                    voters.Add(new TokenVoter(key, score));
                }
            }

            if ((voters.Count > 1) && (sumScore > 0))
            {
                voters.Sort();
                // cleanup voters and rebalance if there are more voters than m_MaxVoters...
                if (voters.Count > m_MaxVoters)
                {
                    sumScore = 0;
                    for (int i = 0; i < m_MaxVoters; i++)
                    {
                        // Score relative to the first voter that is cut off.
                        voters[i].Score -= voters[m_MaxVoters].Score;
                        sumScore += voters[i].Score;
                    }
                    voters.RemoveRange(m_MaxVoters, voters.Count - m_MaxVoters);
                }
            }

            // now normalize results..
            // the results are not an absolute confidence
            // but relative
            if (voters.Count == 1)
            {
                // only one voter, so we are 100% sure that's the one!
                ScoreHolder newScore = new ScoreHolder(100);
                results.Add(voters[0].Category, newScore);
            }
            else
            {
                for (int i = 0; i < voters.Count; i++)
                {
                    TokenVoter stats = voters[i];

                    double percScore = sumScore > 0 ? (stats.Score / sumScore) * 100 : 0;
                    results.Add(stats.Category, new ScoreHolder(percScore));
                }

                // if we have more than one possible result
                // we will try to disambiguate it by comparing the token ranks
                if ((results.Count == 0) || (results.Count > 1))
                {
                    scores.Clear();
                    maxScore = 0;
                    threshold = 0;
                    // collect scores for each table
                    foreach (string category in results.Keys)
                    {
                        ITokenTable catTable = this[category];
                        double score = catTable.ComparisonScore(tblTest, threshold);
                        if (score > 0)
                        {
                            maxScore = Math.Max(maxScore, score);
                            scores.Add(category, score);
                        }
                    }
                    // got results?
                    if (scores.Count > 0)
                    {
                        sumScore = 0;
                        // copy the scores to a sorted voters list
                        voters.Clear();
                        foreach (string key in scores.Keys)
                        {
                            // calc sum score
                            sumScore += scores[key];
                            voters.Add(new TokenVoter(key, scores[key]));
                        }
                        voters.Sort();

                        // now normalize results..
                        // the results are not an absolute confidence
                        // but relative
                        if (voters.Count == 1)
                        {
                            // only one voter: DevideScore(0.75) is applied to all other results
                            foreach (string category in results.Keys)
                            {
                                if (category != voters[0].Category)
                                {
                                    ((ScoreHolder)results[category]).DevideScore(0.75);
                                }
                            }
                        }
                        else
                        {
                            for (int i = 0; i < voters.Count; i++)
                            {
                                TokenVoter stats = voters[i];

                                double percScore = (stats.Score / sumScore) * 200;
                                ((ScoreHolder)results[stats.Category]).AddScore(percScore);
                            }
                            foreach (string category in results.Keys)
                            {
                                ((ScoreHolder)results[category]).DevideScore(0.75);
                            }
                        }
                    }
                }
            }

            // now build a proper result..
            voters.Clear();
            foreach (string key in results.Keys)
            {
                voters.Add(new TokenVoter(key, ((ScoreHolder)results[key]).Score));
            }
            voters.Sort();

            // reduce to maximum results
            if (voters.Count > maxResults)
            {
                voters.RemoveRange(maxResults, voters.Count - maxResults);
            }

            // re-weight...
            double dSumScore = 0;
            foreach (TokenVoter voter in voters)
            {
                dSumScore += voter.Score;
            }

            results.Clear();
            foreach (TokenVoter voter in voters)
            {
                // When every remaining score is 0 there is nothing to weight by; report 0 instead of NaN.
                double weighted = dSumScore > 0 ? (voter.Score / dSumScore) * 100 : 0;
                results.Add(voter.Category, new ScoreHolder(weighted));
            }

            CategoryList result = new CategoryList();
            foreach (string category in results.Keys)
            {
                result.Add(new Category(category, ((ScoreHolder)results[category]).Score));
            }
            result.Sort();
#if DIALOGUEMASTER
            if (UseCounters)
            {
                m_Counters.Classifications.Increment();
                m_Counters.ClassificationsPerSecond.Increment();
                m_Counters.ComparisonTime.IncrementBy(DateTime.Now.Ticks - startTime);
                m_Counters.ComparisonTimeBase.Increment();
            }
#endif
            tblTest.Clear();
            return(result);
        }