Example #1
0
        // Creates a TokenTable sized with MAX_TOKENS_DEFAULT, fills it via
        // CreateFromFile(fileName, charsetFilter) and registers it under the
        // given category name.
        public void AddFile(string category, string fileName, List <char> charsetFilter)
        {
            TokenTable categoryTable = new TokenTable(MAX_TOKENS_DEFAULT);
            categoryTable.CreateFromFile(fileName, charsetFilter);

            this.Add(category, categoryTable);
        }
Example #2
0
		/// <summary>
		/// Initializes the table with the given token limit, creating the nested
		/// word and charset tables and the inner dictionary.
		/// </summary>
		/// <param name="maxTokens">
		/// Value stored in m_MaxTokens; it is passed to the word table and used as
		/// the constructor argument of the inner dictionary.
		/// </param>
		public TokenTable(int maxTokens)
		{
			// The limit must be stored before anything reads m_MaxTokens.
			// Previously the word table was created first and therefore received
			// the field's pre-assignment value instead of the caller's maxTokens.
			this.m_MaxTokens = maxTokens;
			this.m_WordTable = new TokenTable(this.m_MaxTokens, true);
			// The charset table uses a fixed size of 50.
			this.m_CharsetTable = new TokenTable(50, false);
#if DIALOGUEMASTER
			this.m_InnerDict = new RemotableValuesDictionary<string, ITokenStats>(m_MaxTokens);
			// Install and create the counters once, on first use.
			if (UseCounters && (m_Counters == null))
			{
				Installer.InstallCounters();
				m_Counters = new TableCounters();
			}
#else
			this.m_InnerDict = new Dictionary<string, ITokenStats>(m_MaxTokens);
#endif
		}
Example #3
0
        // Serializes the collection as two parallel arrays: the category keys
        // under "Keys" and the token tables under "Values".
        public void GetObjectData(SerializationInfo info, StreamingContext context)
        {
            int entryCount = this.Count;
            String[]     categoryNames = new string[entryCount];
            TokenTable[] tables        = new TokenTable[entryCount];

            // Copy the keys in enumeration order.
            int slot = 0;
            foreach (String name in this.Keys)
            {
                categoryNames[slot++] = name;
            }

            // Copy the values in enumeration order.
            slot = 0;
            foreach (TokenTable table in this.Values)
            {
                tables[slot++] = table;
            }

            info.AddValue("Keys", categoryNames, typeof(String[]));
            info.AddValue("Values", tables, typeof(TokenTable[]));
        }
Example #4
0
		// Rebuilds the table from the serialization data held in m_siInfo:
		// restores the scalar settings and the word table, re-adds every
		// key/value pair, then recreates the charset table via BuildCharTable().
		// Throws SerializationException when no serialization data is available.
		public new void OnDeserialization(object sender)
		{
			if (this.m_siInfo == null)
			{
				throw new SerializationException("Something went wrong during deserialization");
			}

			// Scalar settings and the nested word table.
			this.m_MaxTokens  = this.m_siInfo.GetInt32("MaxTokens");
			this.m_MaxWordLen = this.m_siInfo.GetInt32("MaxWordLen");
			this.m_Ranks      = this.m_siInfo.GetInt32("Ranks");
			this.m_WordTable  = this.m_siInfo.GetValue("WordTable", typeof(TokenTable)) as TokenTable;

			// Entries were stored as two parallel arrays.
			String[]     tokenKeys  = (String[])this.m_siInfo.GetValue("Keys", typeof(String[]));
			TokenStats[] tokenStats = (TokenStats[])this.m_siInfo.GetValue("Values", typeof(TokenStats[]));

			for (int idx = 0; idx < tokenKeys.Length; idx++)
			{
				this.Add(tokenKeys[idx], tokenStats[idx]);
			}

			// rebuild CharTable
			this.m_CharsetTable = new TokenTable();
			this.BuildCharTable();
		}
Example #5
0
        /// <summary>
        /// Classifies <paramref name="text"/> against the enabled category tables and
        /// returns the best candidates with relative scores.
        /// </summary>
        /// <remarks>
        /// The method works in stages: (1) a charset comparison pre-filters the
        /// categories, (2) a word comparison scores the survivors, (3) the scores are
        /// normalized into percentages, (4) when the outcome is not a single category a
        /// further ComparisonScore pass adjusts the percentages, and (5) the list is cut
        /// to <paramref name="maxResults"/> and re-weighted.
        /// </remarks>
        /// <param name="text">The text to classify.</param>
        /// <param name="maxResults">Maximum number of categories kept in the result.</param>
        /// <returns>
        /// A sorted category list. Scores are relative weights (re-weighted to sum to 100
        /// whenever the summed score is positive), not absolute confidences. The list is
        /// empty when no category produced a candidate.
        /// </returns>
        public ICategoryList ClassifyText(string text, int maxResults)
        {
            long           startTime           = DateTime.Now.Ticks;
            ListDictionary results             = new ListDictionary();
            Dictionary <string, double> scores = new Dictionary <string, double>();
            TokenTable tblTest   = new TokenTable(text);
            double     maxScore  = 0;
            double     threshold = 0;

            List <TokenVoter> charsetVoters = new List <TokenVoter>();

            // collect stats based on charset (first filter)
            foreach (string category in this.Keys)
            {
                ITokenTable catTable = this[category];
                if (!catTable.Enabled)
                {
                    continue;
                }
                double score = catTable.CharsetTable.CharsetComparisonScore(tblTest.CharsetTable, threshold);

                // The threshold rises with the best score seen so far.
                if (score > maxScore)
                {
                    maxScore  = score;
                    threshold = (maxScore * this.m_Threshold);
                }
                if (score > threshold)
                {
                    charsetVoters.Add(new TokenVoter(category, score));
                }
            }

            // chinese does not have a "Charset"... so to be sure....
            if ((charsetVoters.Count < 3) && (this.Keys.Contains("zh")))
            {
                charsetVoters.Add(new TokenVoter("zh"));
            }

            // NOTE(review): Sort() is presumably best-score-first (index 0 is read as
            // the top charset score further below) - confirm against TokenVoter's comparer.
            charsetVoters.Sort();
            // Drop voters that fell below the final threshold; iterate backwards so
            // removals do not shift the indices still to be visited.
            for (int i = charsetVoters.Count - 1; i > -1; i--)
            {
                if (charsetVoters[i].Score < threshold)
                {
                    charsetVoters.RemoveAt(i);
                }
            }

            maxScore = 0;
            // collect scores for each table
            int maxWordHits = 0;

            threshold = 0;
            foreach (TokenVoter charVoter in charsetVoters)
            {
                ITokenTable catTable = this[charVoter.Category];
                if (!catTable.Enabled)
                {
                    continue;
                }
                int    hits  = 0;
                double score = catTable.WordComparisonScore(tblTest, threshold, ref hits);
                if (hits > maxWordHits)
                {
                    maxWordHits = hits;
                }

                if (score > maxScore)
                {
                    maxScore  = score;
                    threshold = (maxScore * this.m_Threshold);
                }
                if (score > threshold)
                {
                    scores.Add(charVoter.Category, score);
                }
            }

            double            sumScore = 0;
            List <TokenVoter> voters   = new List <TokenVoter>();

            // No word-based score: fall back to the charset voters. The count check
            // avoids indexing an empty list (no enabled category scored and no "zh"
            // entry); in that case the method simply ends up with an empty result.
            if (scores.Count == 0 && charsetVoters.Count > 0)
            {
                maxScore = charsetVoters[0].Score;
                // take the voters from the closest charset
                foreach (TokenVoter v in charsetVoters)
                {
                    scores.Add(v.Category, v.Score);
                }
            }
            threshold = (maxScore * m_Threshold);

            // copy the scores to a sorted voters list
            foreach (string key in scores.Keys)
            {
                double score = scores[key];

                if (score > threshold)
                {
                    // Scale relative to the best score, then by the best word-hit count.
                    score /= maxScore;
                    if (maxWordHits > 0)
                    {
                        score /= maxWordHits;
                    }
                    score    *= 100;
                    sumScore += score;
                    voters.Add(new TokenVoter(key, score));
                }
            }

            if (voters.Count > 1)
            {
                if (sumScore > 0)
                {
                    voters.Sort();
                    // cleanup voters and rebalance if there are more than m_MaxVoters voters...
                    if (voters.Count > m_MaxVoters)
                    {
                        // Score of the first voter that gets cut; it is subtracted
                        // from every voter that is kept.
                        double cutoffScore = voters[m_MaxVoters].Score;
                        sumScore = 0;
                        for (int i = 0; i < m_MaxVoters; i++)
                        {
                            voters[i].Score -= cutoffScore;
                            sumScore        += voters[i].Score;
                        }
                        voters.RemoveRange(m_MaxVoters, voters.Count - m_MaxVoters);
                    }
                }
            }

            // now normalize results..
            // the results are not an absolute confidence
            // but relative
            if (voters.Count == 1)
            {
                // only one voter, so we are 100% sure that's the one!
                ScoreHolder newScore = new ScoreHolder(100);
                results.Add(voters[0].Category, newScore);
            }
            else
            {
                for (int i = 0; i < voters.Count; i++)
                {
                    TokenVoter stats = voters[i];

                    double percScore = sumScore > 0 ? (stats.Score / sumScore) * 100 : 0;
                    results.Add(stats.Category, new ScoreHolder(percScore));
                }

                // if we have more than one possible result (or none)
                // we will try to disambiguate it by checking for
                // very common words
                if (results.Count != 1)
                {
                    scores.Clear();
                    maxScore  = 0;
                    threshold = 0;
                    // collect scores for each table
                    foreach (string category in results.Keys)
                    {
                        ITokenTable catTable = this[category];
                        double score = catTable.ComparisonScore(tblTest, threshold);
                        if (score > 0)
                        {
                            maxScore = System.Math.Max(maxScore, score);
                            scores.Add(category, score);
                        }
                    }
                    // got results?
                    if (scores.Count > 0)
                    {
                        sumScore = 0;
                        // copy the scores to a sorted voters list
                        voters.Clear();
                        foreach (string key in scores.Keys)
                        {
                            // calc sum score
                            sumScore += scores[key];
                            voters.Add(new TokenVoter(key, scores[key]));
                        }
                        voters.Sort();

                        // now normalize results..
                        // the results are not an absolute confidence
                        // but relative
                        if (voters.Count == 1)
                        {
                            // only one voter: DevideScore(0.75) is applied to every other result
                            // NOTE(review): original comment says the others become "3/4 value" -
                            // confirm what DevideScore does with the 0.75 argument.
                            foreach (string category in results.Keys)
                            {
                                if (category != voters[0].Category)
                                {
                                    ((ScoreHolder)results[category]).DevideScore(0.75);
                                }
                            }
                        }
                        else
                        {
                            // sumScore > 0 here: every collected score is positive.
                            for (int i = 0; i < voters.Count; i++)
                            {
                                TokenVoter stats = voters[i];

                                double percScore = (stats.Score / sumScore) * 200;
                                ((ScoreHolder)results[stats.Category]).AddScore(percScore);
                            }
                            foreach (string category in results.Keys)
                            {
                                ((ScoreHolder)results[category]).DevideScore(0.75);
                            }
                        }
                    }
                }
            }

            // now build a proper result..
            voters.Clear();
            foreach (string key in results.Keys)
            {
                voters.Add(new TokenVoter(key, ((ScoreHolder)results[key]).Score));
            }
            voters.Sort();

            // reduce to maximum results
            if (voters.Count > maxResults)
            {
                voters.RemoveRange(maxResults, voters.Count - maxResults);
            }

            // re-weight...
            double dSumScore = 0;

            foreach (TokenVoter voter in voters)
            {
                dSumScore += voter.Score;
            }
            results.Clear();
            foreach (TokenVoter voter in voters)
            {
                // Guard the division the same way the first normalization does,
                // so an all-zero score set yields 0 instead of NaN.
                double weight = dSumScore > 0 ? (voter.Score / dSumScore) * 100 : 0;
                results.Add(voter.Category, new ScoreHolder(weight));
            }

            CategoryList result = new CategoryList();

            foreach (string category in results.Keys)
            {
                result.Add(new Category(category, ((ScoreHolder)results[category]).Score));
            }
            result.Sort();
#if DIALOGUEMASTER
            if (UseCounters)
            {
                m_Counters.Classifications.Increment();
                m_Counters.ClassificationsPerSecond.Increment();
                m_Counters.ComparisonTime.IncrementBy(DateTime.Now.Ticks - startTime);
                m_Counters.ComparisonTimeBase.Increment();
            }
#endif
            tblTest.Clear();
            return(result);
        }