/// <summary>
/// Loads every *.json file from the dataset directory, tokenizes each item's
/// title + body, and converts the result to a List of Documents.
/// </summary>
/// <param name="directoryUrl"> Unused; the dataset path is derived from the current working directory — TODO confirm whether it should be honored. </param>
/// <returns> The preprocessed documents, each carrying its topic categories. </returns>
private List<Document> preprocessDataset(String directoryUrl)
{
    List<Document> dataset = new List<Document>();
    // Two directory levels above the working directory is the root that holds the "dataset" folder.
    string baseDirPath = Path.GetDirectoryName(Path.GetDirectoryName(System.IO.Directory.GetCurrentDirectory()));
    foreach (string file in Directory.EnumerateFiles(Path.Combine(baseDirPath, "dataset"), "*.json"))
    {
        string json = File.ReadAllText(file);
        List<DocItem> docItems = JsonConvert.DeserializeObject<List<DocItem>>(json);
        foreach (var item in docItems)
        {
            // Skip items without any topic labels.
            // BUGFIX: the original tested "Length < 0", which is never true for an
            // array, so items with an empty topics array slipped through; "== 0"
            // is the intended check.
            if (item.topics == null || item.topics.Length == 0)
            {
                continue;
            }
            // For each doc - tokenize its body and convert it into a Document object.
            Document document = TextTokenizer.tokenize(item.title + " " + item.body);
            document.categories = item.topics.ToList();
            dataset.Add(document);
        }
    }
    return dataset;
}
/// <summary>
/// Preprocesses the original dataset and converts it to a List of Documents.
/// </summary>
/// <param name="trainingDataset"> Map from category name to that category's training example texts. </param>
/// <returns> The tokenized training documents, each tagged with its category. </returns>
private IList<Document> preprocessDataset(IDictionary<string, String[]> trainingDataset)
{
    IList<Document> dataset = new List<Document>();

    // Loop through all the categories and their training examples.
    foreach (KeyValuePair<string, String[]> entry in trainingDataset)
    {
        string category = entry.Key;
        foreach (string example in entry.Value)
        {
            // Tokenize the example text and convert it into a Document object.
            Document doc = TextTokenizer.tokenize(example);
            doc.category = category;
            dataset.Add(doc);
        }
    }

    return dataset;
}
/// <summary>
/// Predicts the top-K categories of a text by using an already trained classifier.
/// </summary>
/// <param name="text"> The text to classify. </param>
/// <param name="topKCategories"> Maximum number of categories to return (default 3). </param>
/// <returns> Up to topKCategories category names with positive scores, ordered by descending score. </returns>
/// <exception cref="ArgumentException"> Thrown when no classifier has been trained yet. </exception>
public List<String> predict(String text, int topKCategories = 3)
{
    if (knowledgeBase == null)
    {
        throw new ArgumentException("Knowledge Bases missing: Make sure you train first a classifier before you use it.");
    }

    // Tokenizes the text and creates a new document.
    Document doc = TextTokenizer.tokenize(text);

    Dictionary<String, double> predictionScores = new Dictionary<string, double>();
    foreach (var categoryCounts in knowledgeBase.logPriors)
    {
        double logprob = categoryCounts.Value; // initialize the score with the category prior

        // For each feature of the document add its conditional log-probability.
        foreach (var tokenCount in doc.tokens)
        {
            // TryGetValue avoids the ContainsKey + indexer double lookup of the original.
            if (!knowledgeBase.logConditionalProbability.TryGetValue(tokenCount.Key, out var categoryProbs))
            {
                continue; // if the feature does not exist just skip it
            }
            // NOTE(review): the token's occurrence count (tokenCount.Value) was read but
            // never used in the original, although its comment said "multiply loglikelihood
            // score with occurrences" — the sibling predict(string) overload DOES multiply.
            // Behavior preserved here; confirm which model is intended.
            if (categoryProbs.TryGetValue(categoryCounts.Key, out var condLogProb))
            {
                logprob += condLogProb;
            }
        }
        predictionScores.Add(categoryCounts.Key, logprob);
    }

    // Sort categories by descending score.
    var list = predictionScores.ToList();
    list.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value));

    // Keep only the categories with positive scores.
    List<string> result = new List<string>();
    foreach (var l in list)
    {
        if (l.Value > 0.0)
        {
            result.Add(l.Key);
        }
    }
    return result.Count >= topKCategories ? result.GetRange(0, topKCategories) : result;
}
/// <summary>
/// Predicts the category of a text by using an already trained classifier
/// and returns its category.
/// </summary>
/// <param name="text"> The text to classify. </param>
/// <returns> The category with the highest score. </returns>
/// <exception cref="ArgumentException"> Thrown when no classifier has been trained yet. </exception>
public virtual string predict(string text)
{
    if (knowledgeBase == null)
    {
        throw new System.ArgumentException("Knowledge Bases missing: Make sure you train first a classifier before you use it.");
    }

    // Tokenizes the text and creates a new document.
    Document doc = TextTokenizer.tokenize(text);

    string bestCategory = null;
    double? bestScore = double.NegativeInfinity;

    foreach (KeyValuePair<string, double> prior in knowledgeBase.logPriors)
    {
        // Initialize the score with the category prior.
        double? score = prior.Value;

        // Add the weighted log-likelihood of every feature of the document.
        foreach (KeyValuePair<string, int> token in doc.tokens)
        {
            // Features absent from the knowledge base are simply skipped.
            if (knowledgeBase.logLikelihoods.ContainsKey(token.Key))
            {
                // Multiply the log-likelihood score with the token's occurrences in the text.
                score += token.Value * knowledgeBase.logLikelihoods[token.Key][prior.Key];
            }
        }

        if (score > bestScore)
        {
            bestScore = score;
            bestCategory = prior.Key;
        }
    }

    return bestCategory; // return the category with the highest score
}