コード例 #1
0
        /// <summary>
        /// Classify an unknown document by scoring it against every trained party in <c>Data</c>.
        /// </summary>
        /// <remarks>
        /// Three independent log-space scores are accumulated per category:
        /// one from the relative frequency of matching words, one from the relative frequency of
        /// matching n-grams, and one from a TF-IDF style weight of matching words. The log of the
        /// party's <c>Probability</c> is added to each score. A party with no matching terms is
        /// scored by that prior alone rather than causing a <see cref="KeyNotFoundException"/>.
        /// The percentage reported for a category is its score divided by the sum of all category
        /// scores, multiplied by 100; it is NaN when that sum is zero (for example when
        /// <c>Data</c> is empty).
        /// </remarks>
        /// <param name="doc">The unknown document to classify</param>
        /// <returns>
        /// Classification results keyed by scorer: <c>"normal"</c> (words), <c>"ngram"</c> (n-grams)
        /// and <c>"tfidf"</c> (TF-IDF weighted words).
        /// </returns>
        public Dictionary <string, ClassificationResults> ClassifyUnknownDocument(Doc doc)
        {
            Dictionary<Category, double> wordScores  = new Dictionary<Category, double>();
            Dictionary<Category, double> nGramScores = new Dictionary<Category, double>();
            Dictionary<Category, double> tfIdfScores = new Dictionary<Category, double>();

            // Adds a log-space contribution to a category, creating the entry on first use.
            // TryGetValue leaves 'current' at 0 when the category is absent, so the first
            // contribution is stored unchanged.
            void AddLogScore(Dictionary<Category, double> scores, Category category, double logValue)
            {
                scores.TryGetValue(category, out double current);
                scores[category] = current + logValue;
            }

            // Converts the accumulated scores of one scorer into per-category percentages.
            ClassificationResults BuildResults(Dictionary<Category, double> scores)
            {
                double total = scores.Values.Sum();

                // A category that was never scored contributes 0.
                double Percentage(Category category)
                {
                    scores.TryGetValue(category, out double score);
                    return score / total * 100;
                }

                ClassificationResults built = new ClassificationResults();
                built.SetConservativePercentage(Percentage(Category.CONSERVATIVE));
                built.SetCoalitionPercentage(Percentage(Category.COALITION));
                built.SetLabourPercentage(Percentage(Category.LABOUR));
                return built;
            }

            foreach (PartyData p in Data)
            {
                Category category = p.GetCategory();
                double logPrior   = Math.Log(p.Probability);

                // Words feed both the relative-frequency score and the TF-IDF score, so the
                // (linear) vocabulary lookup is done once per word and shared by both.
                foreach (Word word in doc.Words)
                {
                    Word match = p.Words.FirstOrDefault(x => x.Key == word.Key);
                    if (match != null)
                    {
                        AddLogScore(wordScores, category, Math.Log(match.RelativeFrequency));

                        // NOTE(review): if TotalDocs and DocumentFrequency are both integers this
                        // division truncates before the multiplication - confirm the operand types.
                        AddLogScore(tfIdfScores, category, Math.Log(match.Frequency * (TotalDocs / match.DocumentFrequency)));
                    }
                }

                // Same relative-frequency scoring, this time using n-grams instead of words.
                foreach (Word ngram in doc.NGrams)
                {
                    Word match = p.NGrams.FirstOrDefault(x => x.Key == ngram.Key);
                    if (match != null)
                    {
                        AddLogScore(nGramScores, category, Math.Log(match.RelativeFrequency));
                    }
                }

                // Apply the party prior last; this also guarantees every party's category has
                // an entry in each score table even when nothing in the document matched.
                AddLogScore(wordScores, category, logPrior);
                AddLogScore(nGramScores, category, logPrior);
                AddLogScore(tfIdfScores, category, logPrior);
            }

            // Package one result object per scorer under the keys callers look up.
            return new Dictionary<string, ClassificationResults>
            {
                ["normal"] = BuildResults(wordScores),
                ["ngram"]  = BuildResults(nGramScores),
                ["tfidf"]  = BuildResults(tfIdfScores)
            };
        }