public SparseVectorClassification AsClassification(StringTableBuilder stringTable)
{
    // Every keyword and topic contributes a unit weight at its string-table index;
    // duplicate indices are then collapsed by summing their weights.
    var unitWeights = Keyword.Concat(Topic)
        .Select(term => new WeightedIndex { Index = stringTable.GetIndex(term), Weight = 1f });
    var merged = unitWeights
        .GroupBy(entry => entry.Index)
        .Select(group => new WeightedIndex { Index = group.Key, Weight = group.Sum(entry => entry.Weight) })
        .ToArray();
    return new SparseVectorClassification {
        Name = Title,
        Data = merged
    };
}
private static StringTableBuilder CreateComplexTableBuilderWithMixedNewLines()
{
    // Three-column fixture whose cells mix \r\n and \n line endings,
    // exercising multi-line cell rendering.
    var builder = new StringTableBuilder();
    builder.AddColumn("col1");
    builder.AddColumn("col2");
    builder.AddColumn("col3");

    // Adds one row with the three cell values in column order.
    void AddDataRow(string first, string second, string third)
    {
        var dataRow = builder.AddRow();
        dataRow.SetCell("col1", first);
        dataRow.SetCell("col2", second);
        dataRow.SetCell("col3", third);
    }

    AddDataRow("foo", "foobarbaz", "this is a\r\nmulti-line\nstring");
    AddDataRow("foobar", "e\r\nee\neee\r\ne", "f");
    AddDataRow("foobarbaz", "h", "i");
    return builder;
}
public void TestNNMF()
{
    // Vectorise the simple Chinese/Japanese sample set and cluster the dense
    // vectors into two groups with NNMF, then recover each cluster's labels.
    var stringTableBuilder = new StringTableBuilder();
    var labelledVectors = NaiveBayesTests.GetSimpleChineseSet(stringTableBuilder)
        .ConvertToSparseVectors(false)
        .Vectorise(true)
        .ToDictionary(item => _lap.Create(item.Data), item => item.Classification);
    var clusters = labelledVectors.Keys.ToList().NNMF(_lap, 2);
    var clusterLabels = clusters
        .Select(cluster => cluster.Select(vector => labelledVectors[vector]).ToArray())
        .ToList();
}
private static StringTableBuilder CreateComplexTableBuilderHAlignMixedColumns()
{
    // Fixture mixing alignment at three levels: all columns right-aligned,
    // the second row left-aligned, and one cell in that row right-aligned again.
    var builder = new StringTableBuilder();
    builder.AddColumn("col1").SetHAlignRight();
    builder.AddColumn("col2").SetHAlignRight();
    builder.AddColumn("col3").SetHAlignRight();

    var firstRow = builder.AddRow();
    firstRow.SetCell("col1", "foo");
    firstRow.SetCell("col2", "foobarbaz");
    firstRow.SetCell("col3", "this is a\nmulti-line\nstring");

    var secondRow = builder.AddRow().SetHAlignLeft();
    secondRow.SetCell("col1", "foobar");
    secondRow.SetCell("col2", "e\nee\neee\ne").SetHAlignRight();
    secondRow.SetCell("col3", "f");

    var thirdRow = builder.AddRow();
    thirdRow.SetCell("col1", "foobarbaz");
    thirdRow.SetCell("col2", "h");
    thirdRow.SetCell("col3", "i");

    return builder;
}
public (string Classification, WeightedIndexList Data) AsClassification(StringTableBuilder stringTable)
{
    // Every keyword and topic contributes a unit weight at its string-table index;
    // duplicate indices are collapsed by summing their weights.
    var unitWeights = Keyword.Concat(Topic)
        .Select(term => new WeightedIndexList.WeightedIndex { Index = stringTable.GetIndex(term), Weight = 1f });
    var merged = unitWeights
        .GroupBy(entry => entry.Index)
        .Select(group => new WeightedIndexList.WeightedIndex { Index = group.Key, Weight = group.Sum(entry => entry.Weight) })
        .ToArray();
    return (Title, new WeightedIndexList { IndexList = merged });
}
public void TestTFIDF()
{
    // Verifies that converting the four-sample Chinese/Japanese bag to sparse
    // vectors merges it into two classifications, and that TFIDF weighting
    // preserves that shape.
    var stringTableBuilder = new StringTableBuilder();
    var bag = new ClassificationBag {
        Classification = new[] {
            Tuple.Create(new[] { "Chinese", "Beijing", "Chinese" }, true),
            Tuple.Create(new[] { "Chinese", "Chinese", "Shanghai" }, true),
            Tuple.Create(new[] { "Chinese", "Macao" }, true),
            Tuple.Create(new[] { "Tokyo", "Japan", "Chinese" }, false),
        }.Select(d => new IndexedClassification {
            Name = d.Item2 ? "china" : "japan",
            Data = d.Item1.Select(s => stringTableBuilder.GetIndex(s)).ToArray()
        }).ToArray()
    };
    // MSTest Assert.AreEqual takes (expected, actual); the arguments were
    // previously reversed, which made failure messages misleading.
    Assert.AreEqual(4, bag.Classification.Length);
    Assert.AreEqual(3, bag.Classification[0].Data.Length);

    var set = bag.ConvertToSparseVectors(true);
    Assert.AreEqual(2, set.Classification.Length);
    Assert.AreEqual(4, set.Classification[0].Data.Length);

    var tfidf = set.TFIDF();
    Assert.AreEqual(2, tfidf.Classification.Length);
    Assert.AreEqual(4, tfidf.Classification[0].Data.Length);
}
private static StringTableBuilder CreateSimpleTableBuilder()
{
    // 3x3 fixture of single-character cells; the middle row is left-aligned.
    var builder = new StringTableBuilder();
    builder.AddColumn("col1");
    builder.AddColumn("col2");
    builder.AddColumn("col3");

    var firstRow = builder.AddRow();
    firstRow.SetCell("col1", "a");
    firstRow.SetCell("col2", "b");
    firstRow.SetCell("col3", "c");

    var secondRow = builder.AddRow().SetHAlignLeft();
    secondRow.SetCell("col1", "d");
    secondRow.SetCell("col2", "e");
    secondRow.SetCell("col3", "f");

    var thirdRow = builder.AddRow();
    thirdRow.SetCell("col1", "g");
    thirdRow.SetCell("col2", "h");
    thirdRow.SetCell("col3", "i");

    return builder;
}
public void AddEmptyRow()
{
    // An empty row inserted between two data rows should render as a blank line.
    var builder = new StringTableBuilder();
    builder.AddColumn("col1");
    builder.AddColumn("col2");
    builder.AddColumn("col3");

    var firstRow = builder.AddRow();
    firstRow.SetCell("col1", "a");
    firstRow.SetCell("col2", "b");
    firstRow.SetCell("col3", "c");

    builder.AddEmptyRow();

    var secondRow = builder.AddRow();
    secondRow.SetCell("col1", "d");
    secondRow.SetCell("col2", "e");
    secondRow.SetCell("col3", "f");

    var output = builder.ToString();
    Console.WriteLine(output);
    var expectedOutput = "abc" + s_nl + " " + s_nl + "def";
    Assert.Equal(expectedOutput, output);
}
public void Write(UnityBinaryWriter writer)
{
    // Serializes the node tree followed by its local string table, then
    // back-fills the 8-byte header (node_count, strtable_length) whose values
    // are only known once everything has been written.
    int header_position = writer.Position;
    writer.Position += 8; // skip header for now

    StringTableBuilder strtable = new StringTableBuilder();

    // Writes a string reference as (offset, flags): common strings use their
    // common-string id with flag 0x8000, all others are added to the local
    // string table and referenced by local offset with flag 0.
    // (Previously duplicated inline for both Type and Name.)
    void WriteStringRef(string value)
    {
        int commonId = GetCommonStringID(value);
        if (commonId == -1) // Not a common string
        {
            writer.WriteUShort(strtable.AddString(value));
            writer.WriteUShort(0);
        }
        else
        {
            writer.WriteUShort((ushort)commonId);
            writer.WriteUShort(0x8000);
        }
    }

    // Write Nodes
    for (int i = 0; i < Nodes.Length; i++)
    {
        writer.WriteUShort(Nodes[i].Version);
        writer.WriteByte(Nodes[i].Level);
        writer.WriteByte((byte)(Nodes[i].IsArray ? 1 : 0));
        WriteStringRef(Nodes[i].Type);
        WriteStringRef(Nodes[i].Name);
        writer.WriteInt(Nodes[i].ByteSize);
        writer.WriteInt(Nodes[i].Index);
        writer.WriteInt(Nodes[i].MetaFlag);
    }

    // Write StringTable
    byte[] strtable_bytes = strtable.ToBytes();
    writer.WriteBytes(strtable_bytes);

    // Write node_count and strtable_length into the reserved header,
    // then restore the write position to the end of the data.
    int final_pos = writer.Position;
    writer.Position = header_position;
    writer.WriteInt(Nodes.Length);
    writer.WriteInt(strtable_bytes.Length);
    writer.Position = final_pos;
}
public void EmptyTable()
{
    // A builder with no columns and no rows must render as the empty string.
    var builder = new StringTableBuilder();
    var rendered = builder.ToString();
    Console.WriteLine(rendered);
    const string expectedOutput = "";
    Assert.Equal(expectedOutput, rendered);
}
public void TestBernoulliNaiveBayes()
{
    // Train Bernoulli naive Bayes on the simple Chinese/Japanese set and
    // verify the test row classifies as "japan".
    var stringTableBuilder = new StringTableBuilder();
    var trainingSet = GetSimpleChineseSet(stringTableBuilder);
    var classifier = trainingSet.TrainBernoulliNaiveBayes().CreateClassifier();
    var classification = classifier.Classify(GetTestRow(stringTableBuilder));
    Assert.IsTrue(classification.First() == "japan");
}
public void TestDecisionTree()
{
    // Train a decision tree on the tabularised Chinese/Japanese set and check
    // the first and last rows classify as "china" and "japan" respectively.
    var stringTableBuilder = new StringTableBuilder();
    var table = NaiveBayesTests.GetSimpleChineseSet(stringTableBuilder)
        .ConvertToSparseVectors(false)
        .ConvertToTable();
    var classifier = table.TrainDecisionTree().CreateClassifier();
    var testRows = table.GetRows(new[] { 0, table.RowCount - 1 });
    Assert.IsTrue(classifier.Classify(testRows[0]).First() == "china");
    Assert.IsTrue(classifier.Classify(testRows[1]).First() == "japan");
}
public void TestRandomForest()
{
    // Train a random forest on the tabularised Chinese/Japanese set and check
    // the first row's best classification is "china".
    var stringTableBuilder = new StringTableBuilder();
    var table = NaiveBayesTests.GetSimpleChineseSet(stringTableBuilder)
        .ConvertToWeightedIndexList(false)
        .ConvertToTable();
    var classifier = table.TrainRandomForest().CreateClassifier();
    var testRows = table.GetRows(new[] { 0, table.RowCount - 1 });
    Assert.IsTrue(classifier.Classify(testRows[0]).GetBestClassification() == "china");
    // Disabled in the original — kept for reference:
    //Assert.IsTrue(classifier.Classify(testRows[1]).First() == "japan");
}
private static string UsageParameters(Command command, string commandName)
{
    // Renders the parameter table for a command: alternating spacer/content
    // columns produce the visual gaps between the name, alias, default-value,
    // type and description columns.
    var builder = new StringBuilder();
    builder.AppendLine(commandName == null ? "Parameters:" : $"{commandName} parameters:");

    var paramNames = command.Parameters.Keys.OrderBy(s => s).ToList();
    var table = new StringTableBuilder();

    // Column order matters; "spaceN" columns are spacers whose header is a blank.
    var columns = new[] { "space0", "name", "space1", "alias", "space2", "default-value", "space3", "type", "space4", "description" };
    foreach (var column in columns)
    {
        table.AddColumn(column);
    }
    var headerRow = table.AddRow();
    foreach (var column in columns)
    {
        headerRow.SetCell(column, column.StartsWith("space") ? " " : column);
    }

    foreach (var paramName in paramNames)
    {
        var param = command.Parameters[paramName];
        var row = table.AddRow();
        row.SetCell("space0", " ");
        row.SetCell("name", $"--{param.Name}");
        row.SetCell("space1", " ");
        row.SetCell("alias", $"{AliasKey(param)}");
        row.SetCell("space2", " ");
        row.SetCell("default-value", param.Optional ? $"{param.DefaultValue}" : "");
        row.SetCell("space3", " ");
        row.SetCell("type", $"{param.Type.Name}");
        row.SetCell("space4", " ");
        row.SetCell("description", $"{param.Description}");
    }
    builder.Append(table.ToString());
    return builder.ToString();
}
public static ClassificationBag GetSimpleChineseSet(StringTableBuilder stringTableBuilder)
{
    // sample data from: http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
    // Each tuple is (tokens, isChina); tokens are mapped to string-table indices.
    var samples = new[] {
        Tuple.Create(new[] { "Chinese", "Beijing", "Chinese" }, true),
        Tuple.Create(new[] { "Chinese", "Chinese", "Shanghai" }, true),
        Tuple.Create(new[] { "Chinese", "Macao" }, true),
        Tuple.Create(new[] { "Tokyo", "Japan", "Chinese" }, false),
    };
    return new ClassificationBag {
        Classification = samples.Select(sample => new IndexedClassification {
            Name = sample.Item2 ? "china" : "japan",
            Data = sample.Item1.Select(token => stringTableBuilder.GetIndex(token)).ToArray()
        }).ToArray()
    };
}
public void TestTFIDF()
{
    // Verifies that converting the four-sample set to weighted index lists
    // merges it into two classifications, and that TFIDF preserves that shape.
    var stringTableBuilder = new StringTableBuilder();
    var data = NaiveBayesTests.GetSimpleChineseSet(stringTableBuilder);
    // MSTest Assert.AreEqual takes (expected, actual); the arguments were
    // previously reversed, which made failure messages misleading.
    Assert.AreEqual(4, data.Count);
    Assert.AreEqual(3, data.First().Data.Count);

    var set = data.ConvertToWeightedIndexList(true);
    Assert.AreEqual(2, set.Count);
    Assert.AreEqual(4, set.First().Data.Count);

    var tfidf = set.TFIDF();
    Assert.AreEqual(2, tfidf.Count);
    Assert.AreEqual(4, tfidf.First().Data.Count);
}
private string UsageCommands(string[] commandNames)
{
    // Two-column listing of each command's name and description.
    var table = new StringTableBuilder();
    table.AddColumn("col1");
    table.AddColumn("col2");
    foreach (var commandName in commandNames)
    {
        var command = _commands[commandName];
        var commandRow = table.AddRow();
        commandRow.SetCell("col1", $" {command.Name}");
        commandRow.SetCell("col2", $" {command.Description}");
    }
    return table.ToString();
}
public void TestKMeans()
{
    // Vectorise the simple Chinese/Japanese set and k-means it into two
    // clusters, then recover the labels for each cluster.
    var stringTableBuilder = new StringTableBuilder();
    var labelledVectors = NaiveBayesTests.GetSimpleChineseSet(stringTableBuilder)
        .ConvertToWeightedIndexList(false)
        .Vectorise()
        .ToDictionary(item => _lap.CreateVector(item.Data), item => item.Classification);
    var clusters = labelledVectors.Keys.ToList().KMeans(2);
    var clusterLabels = clusters
        .Select(cluster => cluster.Select(vector => labelledVectors[vector]).ToArray())
        .ToList();
}
private static string UsageFooter()
{
    // Legend explaining the <parameters> grammar shown in the usage text.
    var table = new StringTableBuilder().SetInnerCellColumnPadding(1);
    table.AddColumn("col1");
    table.AddColumn("col2");
    table.AddColumn("col3");

    var grammarRow = table.AddRow();
    grammarRow.SetCell("col1", "<parameters>");
    grammarRow.SetCell("col2", ":=");
    grammarRow.SetCell("col3", "<p1-key> <p1-value> ... <pN-key> <pN-value>");

    var keyRow = table.AddRow();
    keyRow.SetCell("col1", "<p-key>");
    keyRow.SetCell("col2", ":=");
    keyRow.SetCell("col3", "-<p-alias>|--<p-name>");

    return table.ToString();
}
/// <summary>
/// Classifies text into either positive or negative sentiment
/// The data files can be downloaded from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
/// Trains Bernoulli and multinomial naive Bayes classifiers plus a neural
/// network, then stacks all three outputs into a second network, and finally
/// runs an interactive prompt classifying user-entered text with each model.
/// </summary>
/// <param name="dataFilesPath">Path to extracted data files</param>
public static void SentimentClassification(string dataFilesPath)
{
    var files = new[] { "amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt" };
    var LINE_SEPARATOR = "\n".ToCharArray();
    var SEPARATOR = "\t".ToCharArray();
    var stringTable = new StringTableBuilder();
    // each line is "<sentence>\t<0|1>"; tokenise the sentence and map the flag to a label,
    // then shuffle deterministically (seed 0)
    var sentimentData = files.SelectMany(f => File.ReadAllText(dataFilesPath + f)
        .Split(LINE_SEPARATOR)
        .Where(l => !String.IsNullOrWhiteSpace(l))
        .Select(l => l.Split(SEPARATOR))
        .Select(s => Tuple.Create(_Tokenise(s[0]), s[1][0] == '1' ? "positive" : "negative"))
        .Where(d => d.Item1.Any())
    ).Shuffle(0).ToList();
    var splitSentimentData = sentimentData.Split();

    // build training and test classification bag
    var trainingClassificationBag = _BuildClassificationBag(splitSentimentData.Training, stringTable);
    var testClassificationBag = _BuildClassificationBag(splitSentimentData.Test, stringTable);

    // train a bernoulli naive bayes classifier
    var bernoulli = trainingClassificationBag.TrainBernoulliNaiveBayes();
    Console.WriteLine("Bernoulli accuracy: {0:P}", testClassificationBag
        .Classify(bernoulli.CreateClassifier())
        .Average(r => r.Score)
    );

    // train a multinomial naive bayes classifier
    var multinomial = trainingClassificationBag.TrainMultinomialNaiveBayes();
    Console.WriteLine("Multinomial accuracy: {0:P}", testClassificationBag
        .Classify(multinomial.CreateClassifier())
        .Average(r => r.Score)
    );

    // convert the bags to sparse vectors (note: built from the FULL data set,
    // not just the training split)
    var sentimentDataBag = _BuildClassificationBag(sentimentData, stringTable);
    var sentimentDataSet = sentimentDataBag.ConvertToSparseVectors(false);
    var sentimentDataTableSplit = sentimentDataSet.Split();

    using (var lap = GPUProvider.CreateLinearAlgebra(false)) {
        // dense vector width = highest string-table index + 1
        var maxIndex = sentimentDataSet.GetMaximumIndex() + 1;
        var trainingData = sentimentDataTableSplit.Training.CreateTrainingDataProvider(lap, maxIndex);
        var testData = sentimentDataTableSplit.Test.CreateTrainingDataProvider(lap, maxIndex);
        var classificationTable = sentimentDataSet.GetClassifications().ToDictionary(d => (int)d.Value, d => d.Key);

        // create the three classifiers
        var bernoulliClassifier = bernoulli.CreateClassifier();
        var multinomialClassifier = multinomial.CreateClassifier();
        var neuralClassifier = lap.NN.CreateFeedForward(lap.NN.CreateTrainingContext(ErrorMetricType.OneHot, learningRate: 0.1f, batchSize: 128)
            .TrainNeuralNetwork(lap, trainingData, testData, new LayerDescriptor(0.1f) {
                WeightUpdate = WeightUpdateType.Adam,
                Activation = ActivationType.Relu,
                WeightInitialisation = WeightInitialisationType.Xavier,
                LayerTrainer = LayerTrainerType.Dropout
            }, hiddenLayerSize: 512, numEpochs: 10)
        );

        // create the stacked training set: each document's three classifier
        // outputs become one training example for the stack network
        Console.WriteLine("Creating model stack data set...");
        var modelStacker = new ModelStacker();
        foreach (var item in sentimentDataSet.Classification) {
            var indexList = item.GetIndexList();
            modelStacker.Add(new[] {
                bernoulliClassifier.GetWeightedClassifications(indexList),
                multinomialClassifier.GetWeightedClassifications(indexList),
                neuralClassifier.GetWeightedClassifications(item.Vectorise(maxIndex), classificationTable)
            }, item.Name);
        }

        // convert the stacked data to a data table and split it into training and test sets
        var sentimentDataTable = modelStacker.GetTable();
        var dataTableVectoriser = sentimentDataTable.GetVectoriser();
        var split = sentimentDataTable.Split();
        var trainingStack = lap.NN.CreateTrainingDataProvider(split.Training, dataTableVectoriser);
        var testStack = lap.NN.CreateTrainingDataProvider(split.Test, dataTableVectoriser);
        var targetColumnIndex = sentimentDataTable.TargetColumnIndex;

        // train a neural network on the stacked data; learning rate drops to
        // 0.1 after epoch 10
        var trainingContext = lap.NN.CreateTrainingContext(ErrorMetricType.OneHot, learningRate: 0.3f, batchSize: 8);
        trainingContext.ScheduleTrainingRateChange(10, 0.1f);
        var stackNN = lap.NN.CreateFeedForward(trainingContext.TrainNeuralNetwork(lap, trainingStack, testStack, new LayerDescriptor(0.1f) {
            WeightUpdate = WeightUpdateType.RMSprop,
            Activation = ActivationType.LeakyRelu,
            WeightInitialisation = WeightInitialisationType.Xavier
        }, hiddenLayerSize: 32, numEpochs: 20));

        uint stringIndex;
        Console.WriteLine("Enter some text to test the classifiers...");
        // interactive loop: blank line exits
        while (true) {
            Console.Write(">");
            var line = Console.ReadLine();
            if (String.IsNullOrWhiteSpace(line)) {
                break;
            }
            // keep only tokens already present in the string table
            var tokens = _Tokenise(line);
            var indexList = new List <uint>();
            foreach (var token in tokens) {
                if (stringTable.TryGetIndex(token, out stringIndex)) {
                    indexList.Add(stringIndex);
                }
            }
            if (indexList.Any()) {
                // build a dense term-frequency vector for the query
                var queryTokens = indexList.GroupBy(d => d).Select(g => Tuple.Create(g.Key, (float)g.Count())).ToList();
                var vector = new float[maxIndex];
                foreach (var token in queryTokens) {
                    vector[token.Item1] = token.Item2;
                }
                Console.WriteLine("Bernoulli classification: " + bernoulliClassifier.Classify(indexList).First());
                Console.WriteLine("Multinomial classification: " + multinomialClassifier.Classify(indexList).First());
                Console.WriteLine("Neural network classification: " + classificationTable[neuralClassifier.Execute(vector).MaximumIndex()]);
                // feed the three classifier outputs through the stack network
                var stackInput = modelStacker.Vectorise(new[] {
                    bernoulliClassifier.GetWeightedClassifications(indexList),
                    multinomialClassifier.GetWeightedClassifications(indexList),
                    neuralClassifier.GetWeightedClassifications(vector, classificationTable)
                });
                Console.WriteLine("Stack classification: " + dataTableVectoriser.GetOutputLabel(targetColumnIndex, stackNN.Execute(stackInput).MaximumIndex()));
            } else {
                Console.WriteLine("Sorry, none of those words have been seen before.");
            }
            Console.WriteLine();
        }
    }
    Console.WriteLine();
}
static ClassificationBag _BuildClassificationBag(IReadOnlyList<Tuple<string[], string>> data, StringTableBuilder stringTable)
{
    // Converts (tokens, label) tuples into indexed classifications, mapping
    // each token through the shared string table.
    var classifications = data
        .Select(item => new IndexedClassification {
            Name = item.Item2,
            Data = item.Item1.Select(token => stringTable.GetIndex(token)).ToArray()
        })
        .ToArray();
    return new ClassificationBag { Classification = classifications };
}
public static IReadOnlyList <(string Label, IndexList Data)> GetSimpleChineseSet(StringTableBuilder stringTableBuilder) { // sample data from: http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html var data = new[] {
// Downloads the AAAI-14 accepted-papers CSV, builds sparse feature vectors
// from each paper's keywords/topics, clusters them with NNMF, and hands the
// resulting document clusters to the UI. Runs off the UI thread; all UI
// interaction goes through Dispatcher.Invoke.
void _ClusterDataset()
{
    Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("Downloading dataset..."); });
    var uri = new Uri("https://archive.ics.uci.edu/ml/machine-learning-databases/00307/%5bUCI%5d%20AAAI-14%20Accepted%20Papers%20-%20Papers.csv");
    var KEYWORD_SPLIT = " \n".ToCharArray();
    var TOPIC_SPLIT = "\n".ToCharArray();

    // download the document list
    var docList = new List <AAAIDocument>();
    using (var client = new WebClient()) {
        var data = client.DownloadData(uri);
        Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("Building data table..."); });

        // parse the file CSV
        var dataTable = new StreamReader(new MemoryStream(data)).ParseCSV(',');

        // create strongly typed documents from the data table
        // (column layout: 0=title, 2=groups, 3=keywords, 4=topics, 5=abstract)
        dataTable.ForEach(row => docList.Add(new AAAIDocument {
            Abstract = row.GetField <string>(5),
            Keyword = row.GetField <string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
            Topic = row.GetField <string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
            Group = row.GetField <string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
            Title = row.GetField <string>(0)
        }));
    }

    // create a document lookup table keyed by title
    var docTable = docList.ToDictionary(d => d.Title, d => d);

    // extract features from the document's metadata
    var stringTable = new StringTableBuilder();
    var classificationSet = new SparseVectorClassificationSet {
        Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
    };

    // vectorise (and normalise) the sparse document features
    var encodings = classificationSet.Vectorise(true);

    // convert the sparse feature vectors into dense vectors
    var documentClusterList = new List <DocumentCluster>();
    using (var lap = Provider.CreateLinearAlgebra()) {
        var lookupTable = encodings
            .Select(d => Tuple.Create(d, lap.Create(d.Data).AsIndexable()))
            .ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification])
        ;
        var vectorList = lookupTable.Select(d => d.Key).ToList();
        Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("Clustering data..."); });

        // cluster the dense vectors; one cluster per available cluster colour,
        // reporting the NNMF cost after each of the 40 iterations
        using (var nnmf = new NNMF(lap, vectorList, _clusterColour.Length)) {
            var clusters = nnmf.Cluster(40, cost => {
                Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("NNMF error: " + cost.ToString()); });
            });

            // create document clusters from the NNMF results; each cluster is
            // described by its top 32 ranked feature strings
            int index = 0;
            foreach (var cluster in clusters) {
                var documentCluster = new List <AAAIDocument>();
                foreach (var item in cluster) {
                    var document = lookupTable[item];
                    documentCluster.Add(document);
                }
                var desc = String.Join(", ", nnmf.GetRankedFeatures(index++)
                    .Select(i => stringTable.GetString(i))
                    .Take(32)
                );
                documentClusterList.Add(new DocumentCluster(documentCluster, desc));
            }

            // collect the cluster membership for each document
            for (int i = 0, len = vectorList.Count; i < len; i++) {
                lookupTable[vectorList[i]].ClusterMembership = nnmf.GetClusterMembership(i);
            }
        }
    }
    Dispatcher.Invoke((SimpleDelegate) delegate() { _UpdateUI(documentClusterList); });
}
/// <summary>
/// Classifies text into either positive or negative sentiment
/// The data files can be downloaded from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
/// Trains Bernoulli and multinomial naive Bayes classifiers plus a neural
/// network, joins all three outputs into a stacked network within the same
/// graph, then runs an interactive prompt classifying user-entered text.
/// </summary>
/// <param name="dataFilesPath">Path to extracted data files</param>
public static void SentimentClassification(string dataFilesPath)
{
    var files = new[] { "amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt" };
    var LINE_SEPARATOR = "\n".ToCharArray();
    var SEPARATOR = "\t".ToCharArray();
    var stringTable = new StringTableBuilder();
    // each line is "<sentence>\t<0|1>"; tokenise the sentence, map the flag to
    // a label and shuffle deterministically (seed 0)
    var sentimentData = files.SelectMany(f => File.ReadAllText(dataFilesPath + f).Split(LINE_SEPARATOR).
        Where(l => !string.IsNullOrWhiteSpace(l)).Select(l => l.Split(SEPARATOR)).
        Select(s => Tuple.Create(Tokenise(s[0]), s[1][0] == '1' ? "positive" : "negative")).
        Where(d => d.Item1.Any())).Shuffle(0).ToList();
    var splitSentimentData = sentimentData.Split();

    // build training and test classification bag
    var trainingClassificationBag = BuildIndexedClassifications(splitSentimentData.Training, stringTable);
    var testClassificationBag = BuildIndexedClassifications(splitSentimentData.Test, stringTable);

    // train a bernoulli naive bayes classifier
    var bernoulli = trainingClassificationBag.TrainBernoulliNaiveBayes();
    Console.WriteLine("Bernoulli accuracy: {0:P}", testClassificationBag.Classify(bernoulli.CreateClassifier()).Average(r => r.Score));

    // train a multinomial naive bayes classifier
    var multinomial = trainingClassificationBag.TrainMultinomialNaiveBayes();
    Console.WriteLine("Multinomial accuracy: {0:P}", testClassificationBag.Classify(multinomial.CreateClassifier()).Average(r => r.Score));

    // convert the index lists to vectors and normalise along the way
    // (note: the table is built from the FULL data set, not just the training split)
    var sentimentDataTable = BuildIndexedClassifications(sentimentData, stringTable).
        ConvertToTable().Normalise(NormalisationType.Standard);
    var vectoriser = sentimentDataTable.GetVectoriser();
    var sentimentDataSet = sentimentDataTable.Split(0);
    var dataTableAnalysis = sentimentDataTable.GetAnalysis();

    using (var lap = BrightWireProvider.CreateLinearAlgebra()) {
        var graph = new GraphFactory(lap);
        var trainingData = graph.CreateDataSource(sentimentDataSet.Training, vectoriser);
        var testData = graph.CreateDataSource(sentimentDataSet.Test, vectoriser);
        var indexListEncoder = (IIndexListEncoder)trainingData;

        // use a one hot encoding error metric, rmsprop gradient descent and xavier weight initialisation
        var errorMetric = graph.ErrorMetric.OneHotEncoding;
        var propertySet = graph.CurrentPropertySet.Use(graph.GradientDescent.RmsProp).
            Use(graph.WeightInitialisation.Xavier);
        var engine = graph.CreateTrainingEngine(trainingData, 0.3f);
        // learning rate schedule: drop at epoch 5, spike at 11, settle at 15
        engine.LearningContext.ScheduleLearningRate(5, 0.1f);
        engine.LearningContext.ScheduleLearningRate(11, 1f);
        engine.LearningContext.ScheduleLearningRate(15, 0.3f);

        // train a neural network classifier (512-unit hidden layer, relu + 50% dropout)
        var neuralNetworkWire = graph.Connect(engine).AddFeedForward(512, "layer1")
            //.AddBatchNormalisation()
            .Add(graph.ReluActivation()).AddDropOut(0.5f).
            AddFeedForward(trainingData.OutputSize, "layer2").Add(graph.ReluActivation()).
            AddBackpropagation(errorMetric, "first-network");

        // train the network, keeping the best-scoring model
        Console.WriteLine("Training neural network classifier...");
        const int TRAINING_ITERATIONS = 10;
        GraphModel bestNetwork = null;
        engine.Train(TRAINING_ITERATIONS, testData, errorMetric, network => bestNetwork = network);
        if (bestNetwork != null) {
            engine.LoadParametersFrom(bestNetwork.Graph);
        }
        var firstClassifier = graph.CreateEngine(engine.Graph);

        // stop the backpropagation to the first neural network so stack
        // training does not disturb its weights
        engine.LearningContext.EnableNodeUpdates(neuralNetworkWire.Find("layer1"), false);
        engine.LearningContext.EnableNodeUpdates(neuralNetworkWire.Find("layer2"), false);

        // create the bernoulli classifier wire
        var bernoulliClassifier = bernoulli.CreateClassifier();
        var bernoulliWire = graph.Connect(engine).AddClassifier(bernoulliClassifier, sentimentDataSet.Training, dataTableAnalysis);

        // create the multinomial classifier wire
        var multinomialClassifier = multinomial.CreateClassifier();
        var multinomialWire = graph.Connect(engine).AddClassifier(multinomialClassifier, sentimentDataSet.Training, dataTableAnalysis);

        // join the bernoulli, multinomial and neural network classification outputs
        var firstNetwork = neuralNetworkWire.Find("first-network");
        var joined = graph.Join(multinomialWire, graph.Join(bernoulliWire, graph.Connect(trainingData.OutputSize, firstNetwork)));

        // train an additional classifier on the output of the previous three classifiers
        joined.AddFeedForward(outputSize: 64).Add(graph.ReluActivation()).
            AddDropOut(dropOutPercentage: 0.5f).AddFeedForward(trainingData.OutputSize).
            Add(graph.ReluActivation()).AddBackpropagation(errorMetric);

        // train the network again, keeping the best stacked model
        Console.WriteLine("Training stacked neural network classifier...");
        GraphModel bestStackedNetwork = null;
        engine.Train(10, testData, errorMetric, network => bestStackedNetwork = network);
        if (bestStackedNetwork != null) {
            engine.LoadParametersFrom(bestStackedNetwork.Graph);
        }

        Console.WriteLine("Enter some text to test the classifiers...");
        // interactive loop: blank line exits
        while (true) {
            Console.Write(">");
            var line = Console.ReadLine();
            if (string.IsNullOrWhiteSpace(line)) {
                break;
            }
            // keep only tokens already present in the string table
            var tokens = Tokenise(line);
            var indexList = new List <uint>();
            foreach (var token in tokens) {
                if (stringTable.TryGetIndex(token, out uint stringIndex)) {
                    indexList.Add(stringIndex);
                }
            }
            if (indexList.Any()) {
                // build a dense term-frequency vector for the query
                var queryTokens = indexList.GroupBy(d => d).
                    Select(g => Tuple.Create(g.Key, (float)g.Count())).ToList();
                var vector = new float[trainingData.InputSize];
                foreach (var token in queryTokens) {
                    vector[token.Item1] = token.Item2;
                }
                var indexList2 = IndexList.Create(indexList.ToArray());
                var encodedInput = indexListEncoder.Encode(indexList2);
                Console.WriteLine("Bernoulli classification: " + bernoulliClassifier.Classify(indexList2).First().Label);
                Console.WriteLine("Multinomial classification: " + multinomialClassifier.Classify(indexList2).First().Label);
                // binary output: pick whichever of the two output activations is larger
                var result = firstClassifier.Execute(encodedInput);
                var classification = vectoriser.GetOutputLabel(1, (result.Output[0].Data[0] > result.Output[0].Data[1]) ? 0 : 1);
                Console.WriteLine("Neural network classification: " + classification);
                var stackedResult = engine.Execute(encodedInput);
                var stackedClassification = vectoriser.GetOutputLabel(1, (stackedResult.Output[0].Data[0] > stackedResult.Output[0].Data[1]) ? 0 : 1);
                Console.WriteLine("Stack classification: " + stackedClassification);
            } else {
                Console.WriteLine("Sorry, none of those words have been seen before.");
            }
            Console.WriteLine();
        }
    }
    Console.WriteLine();
}
private static IReadOnlyList<(string Classification, IndexList Data)> BuildIndexedClassifications(
    IReadOnlyList<Tuple<string[], string>> data, StringTableBuilder stringTable)
{
    // Each (tokens, label) tuple becomes (label, index list), with the tokens
    // mapped through the shared string table.
    var result = new List<(string Classification, IndexList Data)>();
    foreach (var item in data)
    {
        var indices = item.Item1.Select(token => stringTable.GetIndex(token)).ToArray();
        result.Add((item.Item2, IndexList.Create(indices)));
    }
    return result;
}
// Downloads the AAAI-14 accepted-papers CSV, builds feature vectors from each
// paper's metadata, projects them into a 3-dimensional latent space via SVD,
// k-means clusters the result, and renders one coloured cube per document in
// the 3D viewport. Runs off the UI thread; UI work goes through Dispatcher.Invoke.
void _AnalyseDataset()
{
    Dispatcher.Invoke(() => { _statusMessage.Add("Downloading dataset..."); });
    var uri = new Uri("https://archive.ics.uci.edu/ml/machine-learning-databases/00307/%5bUCI%5d%20AAAI-14%20Accepted%20Papers%20-%20Papers.csv");
    var KEYWORD_SPLIT = " \n".ToCharArray();
    var TOPIC_SPLIT = "\n".ToCharArray();

    // download the document list
    var docList = new List <AAAIDocument>();
    using (var client = new WebClient()) {
        var data = client.DownloadData(uri);
        Dispatcher.Invoke(() => { _statusMessage.Add("Building data table..."); });

        // parse the file CSV
        var dataTable = new StreamReader(new MemoryStream(data)).ParseCSV(',');

        // create strongly typed documents from the data table
        // (column layout: 0=title, 2=groups, 3=keywords, 4=topics, 5=abstract)
        dataTable.ForEach(row => docList.Add(new AAAIDocument {
            Abstract = row.GetField <string>(5),
            Keyword = row.GetField <string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
            Topic = row.GetField <string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
            Group = row.GetField <string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
            Title = row.GetField <string>(0)
        }));
    }

    // create a document lookup table keyed by title
    var docTable = docList.ToDictionary(d => d.Title, d => d);

    // extract features from the document's metadata
    var stringTable = new StringTableBuilder();
    var classificationSet = new SparseVectorClassificationSet {
        Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
    };

    // create dense feature vectors and normalise along the way
    var encodings = classificationSet.Vectorise(true);

    using (var lap = Provider.CreateLinearAlgebra()) {
        var lookupTable = encodings.Select(d => Tuple.Create(d, lap.Create(d.Data))).ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification]);
        var vectorList = lookupTable.Select(d => d.Key).ToList();

        // create a term/document matrix with terms as columns and documents as rows
        var matrix = lap.CreateMatrix(vectorList.Select(d => d.Data).ToList());
        Dispatcher.Invoke(() => { _statusMessage.Add("Performing latent semantic analysis..."); });

        // compute the SVD of the transposed matrix
        const int K = 3; // latent-space dimensionality (x, y, z for the 3D plot)
        var kIndices = Enumerable.Range(0, K).ToList();
        var matrixT = matrix.Transpose();
        var svd = matrixT.Svd();

        // create latent space from the top K singular values/vectors
        var s = lap.CreateDiagonal(svd.S.AsIndexable().Values.Take(K).ToList());
        var v2 = svd.VT.GetNewMatrixFromRows(kIndices);
        using (var sv2 = s.Multiply(v2)) {
            var vectorList2 = sv2.AsIndexable().Columns.ToList();
            var lookupTable2 = vectorList2.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);

            // cluster the latent space; one cluster per available colour
            var clusters = vectorList2.KMeans(COLOUR_LIST.Length);
            var clusterTable = clusters
                .Select((l, i) => Tuple.Create(l, i))
                .SelectMany(d => d.Item1.Select(v => Tuple.Create(v, d.Item2)))
                .ToDictionary(d => d.Item1, d => COLOUR_LIST[d.Item2])
            ;

            // build the document list, tracking the bounding box of the
            // latent coordinates so they can be normalised afterwards
            var documentList = new List <Document>();
            int index = 0;
            double maxX = double.MinValue, minX = double.MaxValue, maxY = double.MinValue, minY = double.MaxValue, maxZ = double.MinValue, minZ = double.MaxValue;
            foreach (var item in vectorList2) {
                float x = item[0];
                float y = item[1];
                float z = item[2];
                documentList.Add(new Document(x, y, z, index++, lookupTable2[item], clusterTable[item]));
                if (x > maxX) { maxX = x; }
                if (x < minX) { minX = x; }
                if (y > maxY) { maxY = y; }
                if (y < minY) { minY = y; }
                if (z > maxZ) { maxZ = z; }
                if (z < minZ) { minZ = z; }
            }
            double rangeX = maxX - minX;
            double rangeY = maxY - minY;
            double rangeZ = maxZ - minZ;
            foreach (var document in documentList) {
                document.Normalise(minX, rangeX, minY, rangeY, minZ, rangeZ);
            }

            // back on the UI thread: create one cube and search result per
            // document, then reveal the viewport
            Dispatcher.Invoke(() => {
                var numDocs = documentList.Count;
                _cube = new Cube[numDocs];
                _searchResult = new SearchResult[numDocs];
                _statusMessage.Add("Creating 3D graph...");
                var SCALE = 10;
                for (var i = 0; i < numDocs; i++) {
                    var document = documentList[i];
                    var cube = _cube[i] = new Cube(SCALE * document.X, SCALE * document.Y, SCALE * document.Z, i);
                    var searchResult = _searchResult[i] = new SearchResult(document.AAAIDocument, i);
                    cube.Colour = document.Colour;
                    searchResult.Colour = document.Colour;
                    searchResult.MouseHoverEvent += new SearchResult.MouseHoverDelegate(searchResult_MouseHoverEvent);
                    viewPort.Children.Add(cube);
                }
                // group search results by cluster colour in the results panel
                foreach (var item in _searchResult.OrderBy(sr => sr.Colour.GetHashCode())) {
                    panelResults.Children.Add(item);
                }
                icStatus.Visibility = Visibility.Collapsed;
                viewPort.Visibility = Visibility.Visible;
                progress.Visibility = Visibility.Collapsed;
            });
        }
    }
}
/// <summary>
/// Clusters a tagged set of AAAI documents with several algorithms — k-means and NNMF on the
/// raw term vectors, k-means on a 512-dimension random projection, and k-means in a K=256
/// latent space from an SVD — writing each clustering to a separate text file.
/// Data can be downloaded from https://archive.ics.uci.edu/ml/machine-learning-databases/00307/
/// </summary>
/// <param name="dataFilePath">The path to the data file</param>
/// <param name="outputPath">A directory to write the output files to.
/// NOTE(review): output paths are built by plain string concatenation below, so this
/// presumably must end with a trailing directory separator — confirm with callers.</param>
public static void TextClustering(string dataFilePath, string outputPath)
{
    // Parse the CSV data set into a table
    IDataTable dataTable;
    using (var reader = new StreamReader(dataFilePath)) {
        dataTable = reader.ParseCSV();
    }

    // Field delimiters: keywords are space- or newline-separated; topics/groups newline-separated
    var KEYWORD_SPLIT = " \n".ToCharArray();
    var TOPIC_SPLIT = "\n".ToCharArray();

    // Materialise each row as an AAAIDocument
    // (column layout used here: 0=title, 2=groups, 3=keywords, 4=topics, 5=abstract)
    var docList = new List <AAAIDocument>();
    dataTable.ForEach(row => docList.Add(new AAAIDocument {
        Abstract = row.GetField <string>(5),
        Keyword = row.GetField <string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
        Topic = row.GetField <string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
        Group = row.GetField <string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
        Title = row.GetField <string>(0)
    }));

    // Index documents by title and collect the distinct group labels;
    // the number of groups is used as the cluster count for every algorithm below
    var docTable = docList.ToDictionary(d => d.Title, d => d);
    var allGroups = new HashSet <string>(docList.SelectMany(d => d.Group));

    // Encode each document's keywords/topics as a sparse weighted-index classification,
    // then vectorise the whole set
    var stringTable = new StringTableBuilder();
    var classificationSet = new SparseVectorClassificationSet {
        Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
    };
    var encodings = classificationSet.Vectorise(true);

    using (var lap = Provider.CreateLinearAlgebra()) {
        // Map each dense vector back to its source document (keyed by vector reference)
        var lookupTable = encodings.Select(d => Tuple.Create(d, lap.Create(d.Data))).ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification]);
        var vectorList = lookupTable.Select(d => d.Key).ToList();

        // Cluster the raw term vectors directly
        Console.WriteLine("Kmeans clustering...");
        _WriteClusters(outputPath + "kmeans.txt", vectorList.KMeans(allGroups.Count), lookupTable);
        Console.WriteLine("NNMF clustering...");
        _WriteClusters(outputPath + "nnmf.txt", vectorList.NNMF(lap, allGroups.Count, 100), lookupTable);

        // create a term/document matrix with terms as columns and documents as rows
        var matrix = lap.CreateMatrix(vectorList.Select(v => v.Data).ToList());
        // NOTE(review): the vectors are disposed here but still used below as dictionary
        // keys (lookupTable) and positional entries (vectorList[i]) — presumably safe
        // because only reference identity is needed after disposal; confirm.
        vectorList.ForEach(v => v.Dispose());

        // Project the term/document matrix down to 512 dimensions and cluster the projection
        Console.WriteLine("Creating random projection...");
        using (var randomProjection = lap.CreateRandomProjection((int)classificationSet.GetMaximumIndex() + 1, 512)) {
            using (var projectedMatrix = randomProjection.Compute(matrix)) {
                // One projected vector per document row, mapped back to its source document
                var vectorList2 = Enumerable.Range(0, projectedMatrix.RowCount).Select(i => projectedMatrix.Row(i)).ToList();
                var lookupTable2 = vectorList2.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);
                Console.WriteLine("Kmeans clustering of random projection...");
                _WriteClusters(outputPath + "projected-kmeans.txt", vectorList2.KMeans(allGroups.Count), lookupTable2);
                vectorList2.ForEach(v => v.Dispose());
            }
        }

        // Build a K-dimensional latent space from the SVD of the transposed matrix.
        // Intermediates are disposed manually as soon as they are no longer needed
        // (presumably to bound native memory usage) — the exact ordering is deliberate.
        Console.WriteLine("Building latent term/document space...");
        const int K = 256;
        var kIndices = Enumerable.Range(0, K).ToList();
        var matrixT = matrix.Transpose();
        matrix.Dispose();
        var svd = matrixT.Svd();
        matrixT.Dispose();
        // Keep only the top K singular values and the corresponding rows of VT
        var s = lap.CreateDiagonal(svd.S.AsIndexable().Values.Take(K).ToList());
        var v2 = svd.VT.GetNewMatrixFromRows(kIndices);
        svd.Dispose();
        using (var sv2 = s.Multiply(v2)) {
            v2.Dispose();
            s.Dispose();
            // Each column of S*VT is a document in the latent space; cluster those
            var vectorList3 = sv2.AsIndexable().Columns.ToList();
            var lookupTable3 = vectorList3.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);
            Console.WriteLine("Kmeans clustering in latent document space...");
            _WriteClusters(outputPath + "latent-kmeans.txt", vectorList3.KMeans(allGroups.Count), lookupTable3);
        }
    }
}
public static IReadOnlyList <uint> GetTestRow(StringTableBuilder stringTableBuilder) { return(new[] { "Chinese", "Chinese", "Chinese", "Tokyo", "Japan" }.Select(s => stringTableBuilder.GetIndex(s)).ToArray()); }