/// <summary>
/// Classify an unknown document using network knowledge from training documents.
/// Three scores are accumulated per category: one from word relative frequencies,
/// one from n-gram relative frequencies, and one from a TF-IDF style word weighting.
/// Each score is a sum of natural logarithms plus the logarithm of the party's prior
/// probability.
/// </summary>
/// <param name="doc">The unknown document to classify</param>
/// <returns>
/// A dictionary with the keys "normal", "ngram" and "tfidf", each mapping to the
/// percentages computed from the corresponding scoring scheme. If no score was
/// accumulated at all (for example because <c>Data</c> is empty) the score total is 0
/// and the percentages are NaN.
/// </returns>
public Dictionary<string, ClassificationResults> ClassifyUnknownDocument(Doc doc)
{
    Dictionary<Category, double> wordScores = new Dictionary<Category, double>();
    Dictionary<Category, double> nGramScores = new Dictionary<Category, double>();
    Dictionary<Category, double> tfIdfScores = new Dictionary<Category, double>();

    foreach (PartyData p in Data)
    {
        Category category = p.GetCategory();
        double logPrior = Math.Log(p.Probability);

        // Seed the running totals from any value already stored for this category.
        // TryGetValue leaves the out variable at 0 when the key is missing, so a party
        // without a single matching term still gets an entry (its prior) instead of
        // triggering a KeyNotFoundException on a compound assignment to a missing key.
        wordScores.TryGetValue(category, out double wordScore);
        tfIdfScores.TryGetValue(category, out double tfIdfScore);

        // The word-based and TF-IDF scores use the same lookup, so do it once per word.
        foreach (Word word in doc.Words)
        {
            Word match = p.Words.FirstOrDefault(x => x.Key == word.Key);
            if (match == null)
            {
                continue;
            }

            wordScore += Math.Log(match.RelativeFrequency);

            // The cast forces floating-point division; if TotalDocs and DocumentFrequency
            // are both integral the quotient would otherwise be truncated.
            tfIdfScore += Math.Log(match.Frequency * ((double)TotalDocs / match.DocumentFrequency));
        }

        wordScores[category] = wordScore + logPrior;
        tfIdfScores[category] = tfIdfScore + logPrior;

        // Same scheme as the word score, this time using n-grams instead of words.
        nGramScores.TryGetValue(category, out double nGramScore);

        foreach (Word ngram in doc.NGrams)
        {
            Word match = p.NGrams.FirstOrDefault(x => x.Key == ngram.Key);
            if (match != null)
            {
                nGramScore += Math.Log(match.RelativeFrequency);
            }
        }

        nGramScores[category] = nGramScore + logPrior;
    }

    // Add results to dictionary ready for return
    Dictionary<string, ClassificationResults> returnResults = new Dictionary<string, ClassificationResults>();
    returnResults["normal"] = BuildResults(wordScores);
    returnResults["ngram"] = BuildResults(nGramScores);
    returnResults["tfidf"] = BuildResults(tfIdfScores);

    return returnResults;

    // Turns one score table into a results object holding each category's share of the
    // summed scores, expressed as a percentage.
    ClassificationResults BuildResults(Dictionary<Category, double> scores)
    {
        double total = scores.Sum(x => x.Value);

        // A category without an entry contributes a score of 0.
        // NOTE(review): when the scores are negative log-probabilities, a larger share
        // means a more negative score - confirm callers interpret the percentages
        // accordingly.
        double PercentageFor(Category wanted)
        {
            scores.TryGetValue(wanted, out double score);
            return score / total * 100;
        }

        ClassificationResults built = new ClassificationResults();
        built.SetConservativePercentage(PercentageFor(Category.CONSERVATIVE));
        built.SetCoalitionPercentage(PercentageFor(Category.COALITION));
        built.SetLabourPercentage(PercentageFor(Category.LABOUR));
        return built;
    }
}