Example #1
        private List <Document> preprocessDataset(String directoryUrl)
        {
            List <Document> dataset = new List <Document>();

            string baseDirPath = Path.GetDirectoryName(Path.GetDirectoryName(System.IO.Directory.GetCurrentDirectory()));

            foreach (string file in Directory.EnumerateFiles(baseDirPath + @"\dataset", "*.json"))
            {
                string         json     = File.ReadAllText(file);
                List <DocItem> docItems = JsonConvert.DeserializeObject <List <DocItem> >(json);

                Document document;

                foreach (var item in docItems)
                {
                    if (item.topics == null || item.topics.Length == 0)
                    {
                        continue;
                    }
                    //for each doc - tokenize its body and convert it into a Document object.
                    document            = TextTokenizer.tokenize(item.title + " " + item.body);
                    document.categories = item.topics.ToList <String>();
                    dataset.Add(document);
                }
            }
            return(dataset);
        }
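The DocItem and Document types are not defined in these snippets. A minimal, hypothetical sketch of their shapes, reconstructed only from the members the examples access (the property names come from the usage above, everything else is an assumption):

        // Hypothetical type sketch: only the members referenced in the examples are listed.
        // Assumes: using System.Collections.Generic;
        public class DocItem
        {
            public string   title  { get; set; }   // document title, deserialized from JSON
            public string   body   { get; set; }   // document body text
            public string[] topics { get; set; }   // category labels; may be null or empty
        }

        public class Document
        {
            public List<string> categories { get; set; }          // labels assigned in Example #1
            public string category { get; set; }                  // single label assigned in Example #2
            public Dictionary<string, int> tokens { get; set; }   // token -> occurrence count, filled by TextTokenizer.tokenize()
        }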
Example #2
        /// <summary>
        /// Preprocesses the original dataset and converts it to a List of Documents.
        /// </summary>
        /// <param name="trainingDataset"> Map from category name to its training example texts. </param>
        /// <returns> The training examples as a list of tokenized Documents. </returns>
        private IList <Document> preprocessDataset(IDictionary <string, String[]> trainingDataset)
        {
            IList <Document> dataset = new List <Document>();

            string category;

            string[] examples;

            Document doc;

            IEnumerator <KeyValuePair <string, String[]> > it = trainingDataset.GetEnumerator();

            //loop through all the categories and training examples
            while (it.MoveNext())
            {
                KeyValuePair <string, String[]> entry = it.Current;
                category = entry.Key;
                examples = entry.Value;

                for (int i = 0; i < examples.Length; ++i)
                {
                    //for each example in the category tokenize its text and convert it into a Document object.
                    doc          = TextTokenizer.tokenize(examples[i]);
                    doc.category = category;
                    dataset.Add(doc);

                    //examples[i] = null; //try freeing some memory
                }

                //it.remove(); //try freeing some memory
            }

            return(dataset);
        }
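A minimal, hypothetical call of the overload above, written as it would sit inside the declaring class (the method is private); the category names and training texts are invented purely for illustration:

        // Assumes: using System.Collections.Generic;
        IDictionary<string, string[]> trainingDataset = new Dictionary<string, string[]>
        {
            // hypothetical categories and training example texts
            { "sports",  new[] { "the team won the final match", "a record time in the marathon" } },
            { "finance", new[] { "stocks closed higher on friday", "the central bank raised rates" } }
        };

        // every example string is tokenized and tagged with its category
        IList<Document> dataset = preprocessDataset(trainingDataset);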
Example #3
        public List <String> predict(String text, int topKCategories = 3)
        {
            if (knowledgeBase == null)
            {
                throw new ArgumentException("Knowledge base missing: make sure you train a classifier before you use it.");
            }

            //Tokenizes the text and creates a new document
            Document doc = TextTokenizer.tokenize(text);
            double   occurrences;

            //String maxScoreCategory = null;
            //Double maxScore = Double.MinValue;

            Dictionary <String, double> predictionScores = new Dictionary <string, double>();

            foreach (var categoryCounts in knowledgeBase.logPriors)
            {
                double logprob = categoryCounts.Value;
                //foreach feature of the document
                foreach (var tokenCount in doc.tokens)
                {
                    if (!knowledgeBase.logConditionalProbability.ContainsKey(tokenCount.Key))
                    {
                        continue; //if the feature does not exist just skip it
                    }

                    occurrences = tokenCount.Value; //get its occurrences in text

                    if (knowledgeBase.logConditionalProbability[tokenCount.Key].ContainsKey(categoryCounts.Key))
                    {
                        logprob += occurrences * knowledgeBase.logConditionalProbability[tokenCount.Key][categoryCounts.Key]; //weight the loglikelihood score by the token's occurrences
                    }
                }
                predictionScores.Add(categoryCounts.Key, logprob);

                //if (categoryCounts.Value > maxScore)
                //{
                //    maxScore = categoryCounts.Value;
                //    maxScoreCategory = categoryCounts.Key;
                //}
            }

            var list = predictionScores.ToList();

            list.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value)); //sort categories by score, descending
            List <string> result = new List <string>();

            foreach (var l in list)
            {
                if (l.Value > 0.0)
                {
                    result.Add(l.Key);
                }
            }
            return(result.Count >= topKCategories ? result.GetRange(0, topKCategories) : result);  //return the categories with positive odds
        }
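A hypothetical call of the top-K overload above; "classifier" stands for an already-trained instance of the class that declares predict() (the training step itself is not shown in these examples), and the input text is invented:

        // Assumes: using System; using System.Collections.Generic;
        // 'classifier' must already be trained, otherwise predict() throws.
        List<string> topCategories = classifier.predict("the central bank raised interest rates", topKCategories: 2);

        foreach (string category in topCategories)
        {
            Console.WriteLine(category); // categories are ordered by descending score
        }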
Example #4
        /// <summary>
        /// Predicts the category of a text by using an already trained classifier
        /// and returns its category.
        /// </summary>
        /// <param name="text"> The text to classify. </param>
        /// <returns> The category with the highest score. </returns>
        /// <exception cref="ArgumentException"> Thrown when the classifier has not been trained yet. </exception>
        public virtual string predict(string text)
        {
            if (knowledgeBase == null)
            {
                throw new System.ArgumentException("Knowledge base missing: make sure you train a classifier before you use it.");
            }

            //Tokenizes the text and creates a new document
            Document doc = TextTokenizer.tokenize(text);


            string category;
            string feature;
            int    occurrences;
            double? logprob;

            string maxScoreCategory = null;
            double? maxScore        = double.NegativeInfinity;

            //Map<String, Double> predictionScores = new HashMap<>();
            foreach (KeyValuePair <string, double> entry1 in knowledgeBase.logPriors)
            {
                category = entry1.Key;
                logprob  = entry1.Value; //initialize the score with the prior

                //foreach feature of the document
                foreach (KeyValuePair <string, int> entry2 in doc.tokens)
                {
                    feature = entry2.Key;

                    if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
                    {
                        continue; //if the feature does not exist in the knowledge base skip it
                    }

                    occurrences = entry2.Value;                                               //get its occurrences in text

                    logprob += occurrences * knowledgeBase.logLikelihoods[feature][category]; //multiply loglikelihood score with occurrences
                }
                //predictionScores.put(category, logprob);

                if (logprob > maxScore)
                {
                    maxScore         = logprob;
                    maxScoreCategory = category;
                }
            }

            return(maxScoreCategory); //return the category with the highest score
        }
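Written out, the score that both predict() variants compute for each category c is

        score(c) = logPrior(c) + sum over tokens t in the document of count(t) * logLikelihood(t, c)

i.e. the multinomial Naive Bayes log-posterior up to the constant evidence term. Example #4 returns the single category with the highest score, while Example #3 ranks all categories by it and returns the top-K.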