Example #1
        private List <Document> preprocessDataset(String directoryUrl)
        {
            List <Document> dataset = new List <Document>();

            string baseDirPath = Path.GetDirectoryName(Path.GetDirectoryName(System.IO.Directory.GetCurrentDirectory()));

            foreach (string file in Directory.EnumerateFiles(baseDirPath + @"\dataset", "*.json"))
            {
                string         json     = File.ReadAllText(file);
                List <DocItem> docItems = JsonConvert.DeserializeObject <List <DocItem> >(json);

                Document document;

                foreach (var item in docItems)
                {
                    if (item.topics == null || item.topics.Length == 0)
                    {
                        continue;
                    }
                    //for each doc - tokenize its body and convert it into a Document object.
                    document            = TextTokenizer.tokenize(item.title + " " + item.body);
                    document.categories = item.topics.ToList <String>();
                    dataset.Add(document);
                }
            }
            return(dataset);
        }
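The DocItem and Document types are not defined in these snippets. A minimal, hypothetical sketch of their shapes, reconstructed only from the members the examples access (the property names come from the usage above, everything else is an assumption):

        // Hypothetical type sketch: only the members referenced in the examples are listed.
        // Assumes: using System.Collections.Generic;
        public class DocItem
        {
            public string   title  { get; set; }   // document title, deserialized from JSON
            public string   body   { get; set; }   // document body text
            public string[] topics { get; set; }   // category labels; may be null or empty
        }

        public class Document
        {
            public List<string> categories { get; set; }          // labels assigned in Example #1
            public string category { get; set; }                  // single label assigned in Example #2
            public Dictionary<string, int> tokens { get; set; }   // token -> occurrence count, filled by TextTokenizer.tokenize()
        }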
Example #2
        /// <summary>
        /// Preprocesses the original dataset and converts it to a List of Documents.
        /// </summary>
        /// <param name="trainingDataset"> Map from category name to its training example texts. </param>
        /// <returns> The training examples as a list of tokenized Documents. </returns>
        private IList <Document> preprocessDataset(IDictionary <string, String[]> trainingDataset)
        {
            IList <Document> dataset = new List <Document>();

            string category;

            string[] examples;

            Document doc;

            IEnumerator <KeyValuePair <string, String[]> > it = trainingDataset.GetEnumerator();

            //loop through all the categories and training examples
            while (it.MoveNext())
            {
                KeyValuePair <string, String[]> entry = it.Current;
                category = entry.Key;
                examples = entry.Value;

                for (int i = 0; i < examples.Length; ++i)
                {
                    //for each example in the category tokenize its text and convert it into a Document object.
                    doc          = TextTokenizer.tokenize(examples[i]);
                    doc.category = category;
                    dataset.Add(doc);

                    //examples[i] = null; //try freeing some memory
                }

                //it.remove(); //try freeing some memory
            }

            return(dataset);
        }
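A minimal, hypothetical call of the overload above, written as it would sit inside the declaring class (the method is private); the category names and training texts are invented purely for illustration:

        // Assumes: using System.Collections.Generic;
        IDictionary<string, string[]> trainingDataset = new Dictionary<string, string[]>
        {
            // hypothetical categories and training example texts
            { "sports",  new[] { "the team won the final match", "a record time in the marathon" } },
            { "finance", new[] { "stocks closed higher on friday", "the central bank raised rates" } }
        };

        // every example string is tokenized and tagged with its category
        IList<Document> dataset = preprocessDataset(trainingDataset);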
Example #3
        public List <String> predict(String text, int topKCategories = 3)
        {
            if (knowledgeBase == null)
            {
                throw new ArgumentException("Knowledge base missing: make sure you train a classifier before you use it.");
            }

            //Tokenizes the text and creates a new document
            Document doc = TextTokenizer.tokenize(text);
            double   occurrences;

            //String maxScoreCategory = null;
            //Double maxScore = Double.MinValue;

            Dictionary <String, double> predictionScores = new Dictionary <string, double>();

            foreach (var categoryCounts in knowledgeBase.logPriors)
            {
                double logprob = categoryCounts.Value;
                //foreach feature of the document
                foreach (var tokenCount in doc.tokens)
                {
                    if (!knowledgeBase.logConditionalProbability.ContainsKey(tokenCount.Key))
                    {
                        continue; //if the feature does not exist just skip it
                    }

                    occurrences = tokenCount.Value; //get its occurrences in text

                    if (knowledgeBase.logConditionalProbability[tokenCount.Key].ContainsKey(categoryCounts.Key))
                    {
                        logprob += occurrences * knowledgeBase.logConditionalProbability[tokenCount.Key][categoryCounts.Key]; //weight the loglikelihood score by the token's occurrences
                    }
                }
                predictionScores.Add(categoryCounts.Key, logprob);

                //if (categoryCounts.Value > maxScore)
                //{
                //    maxScore = categoryCounts.Value;
                //    maxScoreCategory = categoryCounts.Key;
                //}
            }

            var list = predictionScores.ToList();

            list.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value)); //sort categories by score, descending
            List <string> result = new List <string>();

            foreach (var l in list)
            {
                if (l.Value > 0.0)
                {
                    result.Add(l.Key);
                }
            }
            return(result.Count >= topKCategories ? result.GetRange(0, topKCategories) : result);  //return the categories with positive odds
        }
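A hypothetical call of the top-K overload above; "classifier" stands for an already-trained instance of the class that declares predict() (the training step itself is not shown in these examples), and the input text is invented:

        // Assumes: using System; using System.Collections.Generic;
        // 'classifier' must already be trained, otherwise predict() throws.
        List<string> topCategories = classifier.predict("the central bank raised interest rates", topKCategories: 2);

        foreach (string category in topCategories)
        {
            Console.WriteLine(category); // categories are ordered by descending score
        }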
Example #4
        /// <summary>
        /// Predicts the category of a text by using an already trained classifier
        /// and returns its category.
        /// </summary>
        /// <param name="text"> The text to classify. </param>
        /// <returns> The category with the highest score. </returns>
        /// <exception cref="ArgumentException"> Thrown when the classifier has not been trained yet. </exception>
        public virtual string predict(string text)
        {
            if (knowledgeBase == null)
            {
                throw new System.ArgumentException("Knowledge base missing: make sure you train a classifier before you use it.");
            }

            //Tokenizes the text and creates a new document
            Document doc = TextTokenizer.tokenize(text);


            string category;
            string feature;
            int    occurrences;
            double? logprob;

            string maxScoreCategory = null;
            double? maxScore        = double.NegativeInfinity;

            //Map<String, Double> predictionScores = new HashMap<>();
            foreach (KeyValuePair <string, double> entry1 in knowledgeBase.logPriors)
            {
                category = entry1.Key;
                logprob  = entry1.Value; //initialize the score with the prior

                //foreach feature of the document
                foreach (KeyValuePair <string, int> entry2 in doc.tokens)
                {
                    feature = entry2.Key;

                    if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
                    {
                        continue; //if the feature does not exist in the knowledge base skip it
                    }

                    occurrences = entry2.Value;                                               //get its occurrences in text

                    logprob += occurrences * knowledgeBase.logLikelihoods[feature][category]; //multiply loglikelihood score with occurrences
                }
                //predictionScores.put(category, logprob);

                if (logprob > maxScore)
                {
                    maxScore         = logprob;
                    maxScoreCategory = category;
                }
            }

            return(maxScoreCategory); //return the category with the highest score
        }
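Written out, the score that both predict() variants compute for each category c is

        score(c) = logPrior(c) + sum over tokens t in the document of count(t) * logLikelihood(t, c)

i.e. the multinomial Naive Bayes log-posterior up to the constant evidence term. Example #4 returns the single category with the highest score, while Example #3 ranks all categories by it and returns the top-K.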