/// <summary>
/// Ranks every word (attribute) of the dataset by information gain and returns
/// the top-ranked ones via <c>GetTopAttributes</c>.
/// </summary>
/// <param name="datasetRepresentation">Dataset to analyse; must be non-null with a non-empty word list.</param>
/// <returns>The most important words, as selected by <c>GetTopAttributes</c>.</returns>
public List<string> GetMostImportantWords(DatasetRepresentation datasetRepresentation)
{
    ArgumentValidator.ValidateObject(datasetRepresentation);
    ArgumentValidator.ValidateNotEmptyList(datasetRepresentation.Words);

    var baseEntropy = GetDatasetEntropy(datasetRepresentation);
    double totalDocuments = datasetRepresentation.DocumentTopicsLists.Count;
    var gainByAttribute = new Dictionary<string, double>();

    for (var wordIndex = 0; wordIndex < datasetRepresentation.Words.Count; wordIndex++)
    {
        // Weighted entropy of the partitions induced by each observed frequency value.
        var weightedChildEntropy = 0d;
        foreach (var frequencyValue in datasetRepresentation.GetPossibleValuesOfAttributeInDataset(wordIndex))
        {
            var partition = datasetRepresentation.ReconstructByKeepingOnlyTheseFrequencies(
                new List<int> { frequencyValue }, wordIndex);

            weightedChildEntropy +=
                (partition.DocumentTopicsLists.Count / totalDocuments) * GetDatasetEntropy(partition);
        }

        // Information gain = entropy of the whole set minus the weighted split entropy.
        gainByAttribute.Add(datasetRepresentation.Words[wordIndex], baseEntropy - weightedChildEntropy);
    }

    return GetTopAttributes(gainByAttribute);
}
/// <summary>
/// Builds a new <see cref="DatasetRepresentation"/> containing only the given words,
/// re-indexing each document's word frequencies to the new word order.
/// Zero frequencies are omitted from the per-document dictionaries, matching the
/// sparse representation used elsewhere in this file.
/// </summary>
/// <param name="datasetRepresentation">Source dataset; not modified.</param>
/// <param name="wordsToKeep">Words to retain, in the order they should appear in the new representation.</param>
/// <returns>A new representation with copied word/topic lists and rebuilt frequency maps.</returns>
public static DatasetRepresentation ReconstructByKeepingOnlyTheseWords(this DatasetRepresentation datasetRepresentation, List<string> wordsToKeep)
{
    var documentCount = datasetRepresentation.DocumentWordFrequencies.Count;

    // Fix: the original looked up Words.IndexOf(word) inside the per-document loop,
    // making the method O(documents * keptWords * vocabulary). The mapping from kept
    // word to its old index is loop-invariant, so compute it once up front.
    var oldIndexOfKeptWord = new int[wordsToKeep.Count];
    for (int wordIndex = 0; wordIndex < wordsToKeep.Count; wordIndex++)
    {
        oldIndexOfKeptWord[wordIndex] = datasetRepresentation.Words.IndexOf(wordsToKeep[wordIndex]);
    }

    var documentWordFrequencies = new List<Dictionary<int, int>>();
    for (int documentIndex = 0; documentIndex < documentCount; documentIndex++)
    {
        var frequency = new Dictionary<int, int>();
        for (int wordIndex = 0; wordIndex < wordsToKeep.Count; wordIndex++)
        {
            var oldWordFrequency = datasetRepresentation.GetDocumentWordFrequency(documentIndex, oldIndexOfKeptWord[wordIndex]);
            if (oldWordFrequency > 0)
            {
                frequency.Add(wordIndex, oldWordFrequency);
            }
        }

        documentWordFrequencies.Add(frequency);
    }

    return new DatasetRepresentation
    {
        Words = new List<string>(wordsToKeep),
        DocumentWordFrequencies = documentWordFrequencies,
        DocumentTopicsLists = new List<List<string>>(datasetRepresentation.DocumentTopicsLists)
    };
}
/// <summary>
/// Keeps only the words whose document-apparition percentage lies strictly between
/// the two thresholds, and rebuilds the dataset around them.
/// </summary>
/// <param name="datasetRepresentation">Source dataset; not modified.</param>
/// <param name="lowerThresholdPercentage">Words appearing in at most this percentage of documents are dropped.</param>
/// <param name="upperThresholdPercentage">Words appearing in at least this percentage of documents are dropped.</param>
/// <returns>A new representation containing only the surviving words.</returns>
public static DatasetRepresentation ReconstructByEliminatingWordsBelowAndAboveThresholds(
    this DatasetRepresentation datasetRepresentation, int lowerThresholdPercentage, int upperThresholdPercentage)
{
    var vocabulary = new List<string>(datasetRepresentation.Words);
    var survivingWords = new List<string>();
    double totalDocuments = datasetRepresentation.DocumentTopicsLists.Count;

    for (var wordIndex = 0; wordIndex < vocabulary.Count; wordIndex++)
    {
        // Count how many documents mention this word at least once.
        var documentsContainingWord = 0;
        for (int documentIndex = 0; documentIndex < totalDocuments; documentIndex++)
        {
            if (datasetRepresentation.GetDocumentWordFrequency(documentIndex, wordIndex) > 0)
            {
                documentsContainingWord++;
            }
        }

        // Strictly-between check: boundary percentages are eliminated.
        var apparitionPercentage = documentsContainingWord * 100 / totalDocuments;
        if (apparitionPercentage > lowerThresholdPercentage && apparitionPercentage < upperThresholdPercentage)
        {
            survivingWords.Add(vocabulary[wordIndex]);
        }
    }

    return datasetRepresentation.ReconstructByKeepingOnlyTheseWords(survivingWords);
}
/// <summary>
/// Builds the DataTable header: one column per word, plus a trailing "Topic" column.
/// </summary>
/// <param name="datasetRepresentation">Dataset whose words become the columns.</param>
/// <returns>Columns in word order, with "Topic" last.</returns>
private DataColumn[] GetHeaderColumns(DatasetRepresentation datasetRepresentation)
{
    var columns = new List<DataColumn>();
    foreach (var word in datasetRepresentation.Words)
    {
        columns.Add(new DataColumn(word));
    }

    // The class/label column always comes last so it lines up with the row builder.
    columns.Add(new DataColumn("Topic"));

    return columns.ToArray();
}
/// <summary>
/// Trains the k-nearest-neighbors classifier on the given dataset and remembers
/// the dataset for later use by this class.
/// </summary>
/// <param name="datasetRepresentation">Training dataset; must be non-null.</param>
public void Train(DatasetRepresentation datasetRepresentation)
{
    // Number of neighbors consulted at classification time.
    // NOTE(review): 61 was previously a magic number inline — presumably tuned
    // empirically for this corpus; confirm before changing.
    const int NeighborCount = 61;

    ArgumentValidator.ValidateObject(datasetRepresentation);

    datasetUsedForTraining = datasetRepresentation;

    var inputsOutputsPair = GetInputsAndOutputsForDataset(datasetRepresentation);

    kNearestNeighbors = new KNearestNeighbors(NeighborCount);
    kNearestNeighbors.Learn(inputsOutputsPair.Inputs, inputsOutputsPair.Outputs);
}
/// <summary>
/// Computes the Shannon entropy (base 2) of the dataset's topic distribution:
/// -sum over topics of p * log2(p), where p is the fraction of documents
/// whose topic list contains that topic.
/// </summary>
/// <param name="datasetRepresentation">Dataset whose topic distribution is measured.</param>
/// <returns>The entropy in bits; 0 for a single-topic dataset.</returns>
private double GetDatasetEntropy(DatasetRepresentation datasetRepresentation)
{
    double totalDocuments = datasetRepresentation.DocumentTopicsLists.Count;

    var weightedLogSum = 0d;
    foreach (var topic in datasetRepresentation.GetAllDistinctTopics())
    {
        var matchingDocuments = datasetRepresentation.DocumentTopicsLists.Count(list => list.Contains(topic));
        var probability = matchingDocuments / totalDocuments;
        weightedLogSum += probability * Math.Log2(probability);
    }

    // Entropy is the negated sum; each term above is <= 0.
    return -weightedLogSum;
}
private List <string[]> GetRowsForDocument(DatasetRepresentation datasetRepresentation, int documentIndex) { var documentFrequencies = datasetRepresentation.DocumentWordFrequencies[documentIndex]; var documentTopics = datasetRepresentation.DocumentTopicsLists[documentIndex]; var rows = new List <string[]>(); foreach (var documentTopic in documentTopics) { var values = new string[datasetRepresentation.Words.Count + 1]; for (int attributeIndex = 0; attributeIndex < datasetRepresentation.Words.Count; attributeIndex++) { values[attributeIndex] = datasetRepresentation.GetDocumentWordFrequency(documentIndex, attributeIndex).ToString(); } values[^ 1] = documentTopic;
/// <summary>
/// Serializes the dataset into an ARFF-like text format: word attributes,
/// a quoted topic list, per-topic "@topics" lines, then sparse "@data" rows
/// ("wordIndex:frequency" pairs), one row per (document, topic) pair.
/// Documents without topics are skipped.
/// </summary>
/// <param name="datasetRepresentation">Dataset to serialize.</param>
/// <returns>The formatted text.</returns>
public static string ToArffFileFormat(this DatasetRepresentation datasetRepresentation)
{
    var output = new StringBuilder();
    var topics = datasetRepresentation.GetAllDistinctTopics();

    // Attribute declarations: one per word.
    foreach (var attribute in datasetRepresentation.Words)
    {
        output.AppendLine($"@attribute {attribute} NUMERIC");
    }

    output.AppendLine();

    // All distinct topics on one line, each single-quoted.
    var quotedTopics = topics.Select(x => $"'{x}'").ToList();
    output.AppendLine(string.Join(", ", quotedTopics));
    output.AppendLine();

    // One "@topics" line per distinct topic.
    foreach (var topic in topics)
    {
        output.AppendLine($"@topics {topic}");
    }

    output.AppendLine();
    output.AppendLine("@data");

    for (var documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
    {
        var documentTopics = datasetRepresentation.DocumentTopicsLists[documentIndex];
        if (documentTopics.Count == 0)
        {
            // A document with no topic contributes no data rows.
            continue;
        }

        // Sparse frequency pairs for this document, e.g. "3:2,17:1".
        var pairsString = string.Join(
            ',',
            datasetRepresentation.DocumentWordFrequencies[documentIndex].Select(x => $"{x.Key}:{x.Value}"));

        // Emit one row per topic the document carries.
        for (int topicIndex = 0; topicIndex < documentTopics.Count; topicIndex++)
        {
            output.AppendLine($"{pairsString} # {documentTopics[topicIndex]}");
        }
    }

    return output.ToString();
}
/// <summary>
/// Collects the distinct frequency values the given attribute (word) takes across
/// all documents, sorted ascending.
/// </summary>
/// <param name="datasetRepresentation">Dataset to scan.</param>
/// <param name="attributeIndex">Index of the word whose frequency values are collected.</param>
/// <returns>The distinct frequency values in ascending order.</returns>
public static List<int> GetPossibleValuesOfAttributeInDataset(this DatasetRepresentation datasetRepresentation, int attributeIndex)
{
    // Fix: the original used List.Contains for de-duplication, which is O(n) per
    // document and O(n^2) overall. A HashSet gives O(1) membership checks and
    // yields the same distinct set.
    var possibleValues = new HashSet<int>();
    for (int documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
    {
        possibleValues.Add(datasetRepresentation.GetDocumentWordFrequency(documentIndex, attributeIndex));
    }

    return possibleValues
        .OrderBy(x => x)
        .ToList();
}
/// <summary>
/// Materializes the dataset as a <see cref="DataTable"/> named "Text mining":
/// word columns plus a "Topic" column, with one row per (document, topic) pair
/// as produced by <c>GetRowsForDocument</c>.
/// </summary>
/// <param name="datasetRepresentation">Dataset to convert.</param>
/// <returns>The populated table.</returns>
private DataTable GetDataTableForDataset(DatasetRepresentation datasetRepresentation)
{
    var dataTable = new DataTable("Text mining");
    dataTable.Columns.AddRange(GetHeaderColumns(datasetRepresentation));

    var documentCount = datasetRepresentation.DocumentWordFrequencies.Count;
    for (int documentIndex = 0; documentIndex < documentCount; documentIndex++)
    {
        foreach (var row in GetRowsForDocument(datasetRepresentation, documentIndex))
        {
            dataTable.Rows.Add(row);
        }
    }

    return dataTable;
}
/// <summary>
/// Flattens the multi-label dataset into classifier inputs/outputs: one row per
/// (document, topic) pair. Inputs are the document's word-frequency vectors;
/// outputs are class ids identifying the topic.
/// </summary>
/// <param name="datasetRepresentation">Dataset to convert.</param>
/// <returns>Parallel input rows and output labels, one entry per (document, topic) pair.</returns>
private InputsOutputsPair GetInputsAndOutputsForDataset(DatasetRepresentation datasetRepresentation)
{
    var rowCount = datasetRepresentation
                   .DocumentTopicsLists
                   .Select(x => x.Count)
                   .Sum();
    var columnCount = datasetRepresentation.Words.Count;

    // Fix: the original wrote outputs[row] = topicIndex, i.e. the topic's position
    // inside the document's OWN topic list. The same topic then received different
    // labels in different documents, so the outputs were not usable class ids.
    // Map each distinct topic to one stable id instead.
    var topicToClassId = new Dictionary<string, int>();
    foreach (var topic in datasetRepresentation.GetAllDistinctTopics())
    {
        if (!topicToClassId.ContainsKey(topic))
        {
            topicToClassId.Add(topic, topicToClassId.Count);
        }
    }

    var inputs = new double[rowCount][];
    var outputs = new int[rowCount];

    var currentRowNumber = 0;
    for (int documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
    {
        var documentTopics = datasetRepresentation.DocumentTopicsLists[documentIndex];

        // Build the frequency vector once per document; every topic row shares it.
        var inputRow = new double[columnCount];
        for (var attributeIndex = 0; attributeIndex < columnCount; attributeIndex++)
        {
            inputRow[attributeIndex] = datasetRepresentation.GetDocumentWordFrequency(documentIndex, attributeIndex);
        }

        for (var topicIndex = 0; topicIndex < documentTopics.Count; topicIndex++)
        {
            inputs[currentRowNumber] = inputRow;
            outputs[currentRowNumber] = topicToClassId[documentTopics[topicIndex]];
            currentRowNumber++;
        }
    }

    return new InputsOutputsPair { Inputs = inputs, Outputs = outputs };
}
/// <summary>
/// Builds a new <see cref="DatasetRepresentation"/> containing only the documents
/// whose frequency for the given word is one of the allowed values. The word list
/// and the surviving documents' frequency maps/topic lists are shared with the
/// original (no deep copy).
/// </summary>
/// <param name="datasetRepresentation">Source dataset; not modified.</param>
/// <param name="possibleFrequencyValues">Frequency values a document must match to be kept.</param>
/// <param name="wordIndex">Index of the word whose frequency is tested.</param>
/// <returns>The filtered representation.</returns>
public static DatasetRepresentation ReconstructByKeepingOnlyTheseFrequencies(
    this DatasetRepresentation datasetRepresentation, List<int> possibleFrequencyValues, int wordIndex)
{
    var keptDocumentIndexes = new List<int>();
    for (int documentIndex = 0; documentIndex < datasetRepresentation.DocumentWordFrequencies.Count; documentIndex++)
    {
        if (possibleFrequencyValues.Contains(datasetRepresentation.GetDocumentWordFrequency(documentIndex, wordIndex)))
        {
            keptDocumentIndexes.Add(documentIndex);
        }
    }

    // Fix: the original rebuilt the topics list by scanning every document and
    // asking keptDocumentIndexes.Contains(i) — O(n^2). The kept indexes are
    // already in ascending order, so one direct pass copies both parallel lists
    // and preserves the original ordering. (Frequency and topic lists are treated
    // as parallel per-document lists throughout this file.)
    var newDocumentWordFrequencies = new List<Dictionary<int, int>>();
    var topics = new List<List<string>>();
    foreach (var keptIndex in keptDocumentIndexes)
    {
        newDocumentWordFrequencies.Add(datasetRepresentation.DocumentWordFrequencies[keptIndex]);
        topics.Add(datasetRepresentation.DocumentTopicsLists[keptIndex]);
    }

    return new DatasetRepresentation
    {
        Words = datasetRepresentation.Words,
        DocumentWordFrequencies = newDocumentWordFrequencies,
        DocumentTopicsLists = topics
    };
}
/// <summary>
/// Trains the ID3 decision tree: codifies the dataset's DataTable into symbols,
/// learns the tree from word columns (inputs) and the "Topic" column (outputs),
/// and pre-builds the 2-column query template used at classification time.
/// </summary>
/// <param name="datasetRepresentation">Training dataset; must be non-null.</param>
public void Train(DatasetRepresentation datasetRepresentation)
{
    ArgumentValidator.ValidateObject(datasetRepresentation);

    var dataTable = GetDataTableForDataset(datasetRepresentation);

    codeBook = new Codification(dataTable);
    var symbols = codeBook.Apply(dataTable);

    // Fix: the original passed Words.Select(x => x).ToArray() — the identity
    // Select was redundant.
    var inputs = symbols.ToJagged<int>(datasetRepresentation.Words.ToArray());
    var outputs = symbols.ToArray<int>("Topic");

    var id3LearningForDataset = GetId3LearningForDataset(datasetRepresentation);
    decisionTree = id3LearningForDataset.Learn(inputs, outputs);

    // Query template: [word, value] rows. NOTE(review): "2" is the default value
    // placed for every word — presumably a placeholder frequency overwritten per
    // query; confirm against the classify path.
    input = new string[datasetRepresentation.Words.Count, 2];
    for (var index = 0; index < datasetRepresentation.Words.Count; index++)
    {
        input[index, 0] = datasetRepresentation.Words[index];
        input[index, 1] = "2";
    }
}