        public List <string> GetMostImportantWords(DatasetRepresentation datasetRepresentation)
        {
            ArgumentValidator.ValidateObject(datasetRepresentation);
            ArgumentValidator.ValidateNotEmptyList(datasetRepresentation.Words);

            var    datasetEntropy = GetDatasetEntropy(datasetRepresentation);
            var    attributeAndInformationGainPairs = new Dictionary <string, double>();
            double datasetDocumentCount             = datasetRepresentation.DocumentTopicsLists.Count;

            for (var attributeIndex = 0; attributeIndex < datasetRepresentation.Words.Count; attributeIndex++)
            {
                var possibleValues = datasetRepresentation.GetPossibleValuesOfAttributeInDataset(attributeIndex);
                var sum            = 0d;

                foreach (var possibleValue in possibleValues)
                {
                    var subset = datasetRepresentation.ReconstructByKeepingOnlyTheseFrequencies(new List <int> {
                        possibleValue
                    }, attributeIndex);
                    sum += (subset.DocumentTopicsLists.Count / datasetDocumentCount) * GetDatasetEntropy(subset);
                }

                var attribute       = datasetRepresentation.Words[attributeIndex];
                var informationGain = datasetEntropy - sum;
                attributeAndInformationGainPairs.Add(attribute, informationGain);
            }

            return(GetTopAttributes(attributeAndInformationGainPairs));
        }
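For reference, the loop above computes the usual information gain of an attribute a over the dataset S, where H is the entropy returned by GetDatasetEntropy (shown further below) and S_v is the subset of documents whose frequency for a equals v; this is just a restatement of what the code does, with no values beyond those in the snippet:

IG(S, a) = H(S) - \sum_{v \in V(a)} \frac{|S_v|}{|S|} \, H(S_v)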
        public static DatasetRepresentation ReconstructByKeepingOnlyTheseWords(this DatasetRepresentation datasetRepresentation, List <string> wordsToKeep)
        {
            var documentCount           = datasetRepresentation.DocumentWordFrequencies.Count;
            var documentWordFrequencies = new List <Dictionary <int, int> >();

            for (int documentIndex = 0; documentIndex < documentCount; documentIndex++)
            {
                var frequency = new Dictionary <int, int>();

                for (int wordIndex = 0; wordIndex < wordsToKeep.Count; wordIndex++)
                {
                    var indexOfWordInOldRepresentation = datasetRepresentation.Words.IndexOf(wordsToKeep[wordIndex]);
                    var oldWordFrequency = datasetRepresentation.GetDocumentWordFrequency(documentIndex, indexOfWordInOldRepresentation);

                    if (oldWordFrequency > 0)
                    {
                        frequency.Add(wordIndex, oldWordFrequency);
                    }
                }

                documentWordFrequencies.Add(frequency);
            }

            return(new DatasetRepresentation
            {
                Words = new List <string>(wordsToKeep),
                DocumentWordFrequencies = documentWordFrequencies,
                DocumentTopicsLists = new List <List <string> >(datasetRepresentation.DocumentTopicsLists)
            });
        }
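A minimal usage sketch for this extension method, assuming only the three-property shape of DatasetRepresentation used above; the words, frequencies, and topics are illustrative values, not data from the original project:

            // Hypothetical example data: two documents over three words, plus their topic lists.
            var dataset = new DatasetRepresentation
            {
                Words = new List <string> { "market", "goal", "election" },
                DocumentWordFrequencies = new List <Dictionary <int, int> >
                {
                    new Dictionary <int, int> { { 0, 3 }, { 2, 1 } }, // document 0: word index -> frequency
                    new Dictionary <int, int> { { 1, 5 } }            // document 1
                },
                DocumentTopicsLists = new List <List <string> >
                {
                    new List <string> { "economy" },
                    new List <string> { "sports" }
                }
            };

            // Keep two of the three words; word indexes in the reduced frequency maps are re-based.
            var reduced = dataset.ReconstructByKeepingOnlyTheseWords(new List <string> { "market", "election" });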
        public static DatasetRepresentation ReconstructByEliminatingWordsBelowAndAboveThresholds(
            this DatasetRepresentation datasetRepresentation,
            int lowerThresholdPercentage,
            int upperThresholdPercentage)
        {
            var    allWords      = new List <string>(datasetRepresentation.Words);
            var    wordsToKeep   = new List <string>();
            double documentCount = datasetRepresentation.DocumentTopicsLists.Count;

            for (var wordIndex = 0; wordIndex < allWords.Count; wordIndex++)
            {
                var word = allWords[wordIndex];
                var documentsInWhichWordAppears = 0;

                for (int documentIndex = 0; documentIndex < documentCount; documentIndex++)
                {
                    if (datasetRepresentation.GetDocumentWordFrequency(documentIndex, wordIndex) > 0)
                    {
                        documentsInWhichWordAppears++;
                    }
                }

                var apparitionPercentage = documentsInWhichWordAppears * 100 / documentCount;

                if (apparitionPercentage > lowerThresholdPercentage && apparitionPercentage < upperThresholdPercentage)
                {
                    wordsToKeep.Add(word);
                }
            }

            return(datasetRepresentation.ReconstructByKeepingOnlyTheseWords(wordsToKeep));
        }
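A hedged follow-up to the dataset built in the sketch above; the 1% and 90% thresholds are illustrative choices, not values prescribed by the code:

            // Keep only words that appear in more than 1% and fewer than 90% of the documents.
            var pruned = dataset.ReconstructByEliminatingWordsBelowAndAboveThresholds(1, 90);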
        private DataColumn[] GetHeaderColumns(DatasetRepresentation datasetRepresentation)
        {
            var headerColumns = datasetRepresentation
                                .Words
                                .Select(x => new DataColumn(x))
                                .ToList();

            headerColumns.Add(new DataColumn("Topic"));

            return(headerColumns.ToArray());
        }
        public void Train(DatasetRepresentation datasetRepresentation)
        {
            ArgumentValidator.ValidateObject(datasetRepresentation);

            datasetUsedForTraining = datasetRepresentation;
            var inputsOutputsPair = GetInputsAndOutputsForDataset(datasetRepresentation);
            var k = 61;

            kNearestNeighbors = new KNearestNeighbors(k);
            kNearestNeighbors.Learn(inputsOutputsPair.Inputs, inputsOutputsPair.Outputs);
        }
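After Train, prediction presumably builds the same kind of feature vector (one frequency per word, in the order of datasetUsedForTraining.Words) and passes it to Accord.NET's KNearestNeighbors.Decide. The Predict method below is a hypothetical sketch under that assumption, not part of the original class:

        public int Predict(DatasetRepresentation dataset, int documentIndex)
        {
            // Assumes the given dataset uses the same word order as the training dataset.
            var features = new double[datasetUsedForTraining.Words.Count];

            for (var attributeIndex = 0; attributeIndex < features.Length; attributeIndex++)
            {
                features[attributeIndex] = dataset.GetDocumentWordFrequency(documentIndex, attributeIndex);
            }

            // Decide returns the integer class label the classifier was trained with.
            return(kNearestNeighbors.Decide(features));
        }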
        private double GetDatasetEntropy(DatasetRepresentation datasetRepresentation)
        {
            var    possibleTopics = datasetRepresentation.GetAllDistinctTopics();
            var    sum            = 0d;
            double documentCount  = datasetRepresentation.DocumentTopicsLists.Count;

            foreach (var topic in possibleTopics)
            {
                var documentsWithGivenTopic = datasetRepresentation.DocumentTopicsLists.Count(x => x.Contains(topic));
                var probability             = documentsWithGivenTopic / documentCount;
                sum += probability * Math.Log2(probability);
            }

            return(-sum);
        }
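In formula form, GetDatasetEntropy returns the Shannon entropy of the topic distribution, where p_t is the fraction of documents labelled with topic t:

H(S) = -\sum_{t} p_t \log_2 p_t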
        private List <string[]> GetRowsForDocument(DatasetRepresentation datasetRepresentation, int documentIndex)
        {
            var documentTopics = datasetRepresentation.DocumentTopicsLists[documentIndex];
            var rows           = new List <string[]>();

            foreach (var documentTopic in documentTopics)
            {
                var values = new string[datasetRepresentation.Words.Count + 1];

                for (int attributeIndex = 0; attributeIndex < datasetRepresentation.Words.Count; attributeIndex++)
                {
                    values[attributeIndex] = datasetRepresentation.GetDocumentWordFrequency(documentIndex, attributeIndex).ToString();
                }

                values[^1] = documentTopic;
                rows.Add(values);
            }

            return(rows);
        }
        public static string ToArffFileFormat(this DatasetRepresentation datasetRepresentation)
        {
            var stringBuilder = new StringBuilder();
            var topics        = datasetRepresentation.GetAllDistinctTopics();

            foreach (var attribute in datasetRepresentation.Words)
            {
                stringBuilder.AppendLine($"@attribute {attribute} NUMERIC");
            }

            stringBuilder.AppendLine();
            var formattedTopics = topics
                                  .Select(x => $"'{x}'")
                                  .ToList();

            stringBuilder.AppendLine(string.Join(", ", formattedTopics));
            stringBuilder.AppendLine();

            foreach (var topic in topics)
            {
                stringBuilder.AppendLine($"@topics {topic}");
            }

            stringBuilder.AppendLine();
            stringBuilder.AppendLine("@data");

            for (var documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
            {
                if (datasetRepresentation.DocumentTopicsLists[documentIndex].Count == 0)
                {
                    continue;
                }

                var datasetRepresentationDocumentWordFrequency = datasetRepresentation.DocumentWordFrequencies[documentIndex];
                var formattedPairs = datasetRepresentationDocumentWordFrequency
                                     .Select(x => $"{x.Key}:{x.Value}")
                                     .ToList();
                var pairsString = string.Join(',', formattedPairs);

                for (int topicIndex = 0; topicIndex < datasetRepresentation.DocumentTopicsLists[documentIndex].Count; topicIndex++)
                {
                    stringBuilder.AppendLine($"{pairsString} # {datasetRepresentation.DocumentTopicsLists[documentIndex][topicIndex]}");
                }
            }

            return(stringBuilder.ToString());
        }
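A one-line usage sketch; the file name is illustrative and System.IO is assumed to be imported:

            // Write the serialized dataset to disk (hypothetical file name).
            File.WriteAllText("dataset.arff", datasetRepresentation.ToArffFileFormat());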
        public static List <int> GetPossibleValuesOfAttributeInDataset(this DatasetRepresentation datasetRepresentation, int attributeIndex)
        {
            var possibleValues = new List <int>();

            for (int documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
            {
                var value = datasetRepresentation.GetDocumentWordFrequency(documentIndex, attributeIndex);
                if (!possibleValues.Contains(value))
                {
                    possibleValues.Add(value);
                }
            }

            return(possibleValues
                   .OrderBy(x => x)
                   .ToList());
        }
        private DataTable GetDataTableForDataset(DatasetRepresentation datasetRepresentation)
        {
            var dataTable = new DataTable("Text mining");

            var headerColumns = GetHeaderColumns(datasetRepresentation);

            dataTable.Columns.AddRange(headerColumns);

            for (int documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
            {
                var documentRows = GetRowsForDocument(datasetRepresentation, documentIndex);

                foreach (var documentRow in documentRows)
                {
                    dataTable.Rows.Add(documentRow);
                }
            }

            return(dataTable);
        }
        private InputsOutputsPair GetInputsAndOutputsForDataset(DatasetRepresentation datasetRepresentation)
        {
            var rowCount = datasetRepresentation
                           .DocumentTopicsLists
                           .Select(x => x.Count)
                           .Sum();

            var columnCount = datasetRepresentation.Words.Count;

            var inputs           = new double[rowCount][];
            var outputs          = new int[rowCount];
            var currentRowNumber = 0;
            var allTopics        = datasetRepresentation.GetAllDistinctTopics().ToList();

            for (int documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
            {
                var documentTopics = datasetRepresentation.DocumentTopicsLists[documentIndex];

                var inputRow = new double[columnCount];
                for (var attributeIndex = 0; attributeIndex < columnCount; attributeIndex++)
                {
                    inputRow[attributeIndex] = datasetRepresentation.GetDocumentWordFrequency(documentIndex, attributeIndex);
                }

                for (var topicIndex = 0; topicIndex < documentTopics.Count; topicIndex++)
                {
                    inputs[currentRowNumber]  = inputRow;
                    // Use the topic's index among all distinct topics so class codes are
                    // comparable across documents (topicIndex is only the position inside
                    // this document's own topic list).
                    outputs[currentRowNumber] = allTopics.IndexOf(documentTopics[topicIndex]);
                    currentRowNumber++;
                }
            }


            return(new InputsOutputsPair
            {
                Inputs = inputs,
                Outputs = outputs
            });
        }
        public static DatasetRepresentation ReconstructByKeepingOnlyTheseFrequencies(
            this DatasetRepresentation datasetRepresentation,
            List <int> possibleFrequencyValues,
            int wordIndex)
        {
            var indexesOfDocumentsWithGivenFrequencyValues = new List <int>();

            for (int documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
            {
                if (possibleFrequencyValues.Contains(datasetRepresentation.GetDocumentWordFrequency(documentIndex, wordIndex)))
                {
                    indexesOfDocumentsWithGivenFrequencyValues.Add(documentIndex);
                }
            }

            var newDocumentWordFrequencies = new List <Dictionary <int, int> >();

            foreach (var oldDocumentIndex in indexesOfDocumentsWithGivenFrequencyValues)
            {
                newDocumentWordFrequencies.Add(datasetRepresentation.DocumentWordFrequencies[oldDocumentIndex]);
            }

            var topics = new List <List <string> >();

            for (var oldDocumentIndex = 0; oldDocumentIndex < datasetRepresentation.DocumentTopicsLists.Count; oldDocumentIndex++)
            {
                if (indexesOfDocumentsWithGivenFrequencyValues.Contains(oldDocumentIndex))
                {
                    topics.Add(datasetRepresentation.DocumentTopicsLists[oldDocumentIndex]);
                }
            }

            return(new DatasetRepresentation
            {
                Words = datasetRepresentation.Words,
                DocumentWordFrequencies = newDocumentWordFrequencies,
                DocumentTopicsLists = topics
            });
        }
        public void Train(DatasetRepresentation datasetRepresentation)
        {
            ArgumentValidator.ValidateObject(datasetRepresentation);

            var dataTable = GetDataTableForDataset(datasetRepresentation);

            codeBook = new Codification(dataTable);

            var symbols = codeBook.Apply(dataTable);
            var inputs  = symbols.ToJagged <int>(datasetRepresentation.Words.ToArray());
            var outputs = symbols.ToArray <int>("Topic");
            var id3LearningForDataset = GetId3LearningForDataset(datasetRepresentation);

            decisionTree = id3LearningForDataset.Learn(inputs, outputs);

            input = new string[datasetRepresentation.Words.Count, 2];
            for (var index = 0; index < datasetRepresentation.Words.Count; index++)
            {
                var datasetRepresentationWord = datasetRepresentation.Words[index];
                input[index, 0] = datasetRepresentationWord;
                input[index, 1] = "2";
            }
        }
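The two-column input array filled at the end of Train has the word/value shape that Accord.NET's Codification.Transform accepts, so classification presumably follows the standard Codification / DecisionTree pattern sketched below; wiring it to these particular fields is an assumption, not code from the original source:

            // Hypothetical prediction step using the codeBook, decisionTree, and input fields set up in Train.
            var codes          = codeBook.Transform(input);              // encode the word/value pairs
            var predictedCode  = decisionTree.Decide(codes);             // integer class chosen by the tree
            var predictedTopic = codeBook.Revert("Topic", predictedCode);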