Ejemplo n.º 1
0
            /// <summary>
            /// Converts this document's keywords and topics into a sparse vector classification:
            /// each string is mapped to an index via the supplied string table, given weight 1,
            /// and duplicate indices are merged by summing their weights.
            /// </summary>
            /// <param name="stringTable">String table used to map strings to indices</param>
            /// <returns>A classification named after the document title with one weighted index per unique string</returns>
            public SparseVectorClassification AsClassification(StringTableBuilder stringTable)
            {
                // Keywords and topics are weighted identically, so process them in a single
                // pass instead of duplicating the loop body for each collection.
                var weightedIndex = Keyword
                    .Concat(Topic)
                    .Select(item => new WeightedIndex {
                        Index  = stringTable.GetIndex(item),
                        Weight = 1f
                    })
                    .ToList();

                return(new SparseVectorClassification {
                    Name = Title,
                    // Merge duplicate indices by summing their weights
                    Data = weightedIndex
                           .GroupBy(d => d.Index)
                           .Select(g => new WeightedIndex {
                        Index = g.Key,
                        Weight = g.Sum(d => d.Weight)
                    })
                           .ToArray()
                });
            }
        /// <summary>
        /// Builds a three column table whose cells mix Windows (\r\n) and Unix (\n)
        /// line endings, for exercising multi-line cell rendering.
        /// </summary>
        private static StringTableBuilder CreateComplexTableBuilderWithMixedNewLines()
        {
            var builder = new StringTableBuilder();

            foreach (var columnName in new[] { "col1", "col2", "col3" })
            {
                builder.AddColumn(columnName);
            }

            // One entry per row: values for col1, col2 and col3 in order.
            var cellValues = new[] {
                new[] { "foo", "foobarbaz", "this is a\r\nmulti-line\nstring" },
                new[] { "foobar", "e\r\nee\neee\r\ne", "f" },
                new[] { "foobarbaz", "h", "i" },
            };

            foreach (var values in cellValues)
            {
                var row = builder.AddRow();
                row.SetCell("col1", values[0]);
                row.SetCell("col2", values[1]);
                row.SetCell("col3", values[2]);
            }

            return builder;
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Smoke test: clusters the simple Chinese data set with non-negative matrix
 /// factorisation (NNMF) into two clusters and maps each cluster back to its labels.
 /// NOTE(review): no assertions — only verifies the pipeline runs without throwing.
 /// </summary>
 public void TestNNMF()
 {
     var stringTable = new StringTableBuilder();
     var labelledVectors = NaiveBayesTests
         .GetSimpleChineseSet(stringTable)
         .ConvertToSparseVectors(false)
         .Vectorise(true)
         .ToDictionary(item => _lap.Create(item.Data), item => item.Classification);
     var foundClusters = labelledVectors.Select(entry => entry.Key).ToList().NNMF(_lap, 2);
     var clusterLabels = foundClusters
         .Select(cluster => cluster.Select(vector => labelledVectors[vector]).ToArray())
         .ToList();
 }
        /// <summary>
        /// Builds a three column, right-aligned table in which the second row is
        /// left-aligned except for one cell that is forced back to right alignment.
        /// </summary>
        private static StringTableBuilder CreateComplexTableBuilderHAlignMixedColumns()
        {
            var builder = new StringTableBuilder();

            foreach (var columnName in new[] { "col1", "col2", "col3" })
            {
                builder.AddColumn(columnName).SetHAlignRight();
            }

            var firstRow = builder.AddRow();
            firstRow.SetCell("col1", "foo");
            firstRow.SetCell("col2", "foobarbaz");
            firstRow.SetCell("col3", "this is a\nmulti-line\nstring");

            // Whole row left-aligned, but col2's cell overrides back to right alignment.
            var secondRow = builder.AddRow().SetHAlignLeft();
            secondRow.SetCell("col1", "foobar");
            secondRow.SetCell("col2", "e\nee\neee\ne").SetHAlignRight();
            secondRow.SetCell("col3", "f");

            var thirdRow = builder.AddRow();
            thirdRow.SetCell("col1", "foobarbaz");
            thirdRow.SetCell("col2", "h");
            thirdRow.SetCell("col3", "i");

            return builder;
        }
Ejemplo n.º 5
0
            /// <summary>
            /// Converts this document's keywords and topics into a labelled weighted index list:
            /// each string is mapped to an index via the supplied string table, given weight 1,
            /// and duplicate indices are merged by summing their weights.
            /// </summary>
            /// <param name="stringTable">String table used to map strings to indices</param>
            /// <returns>The document title paired with its weighted index list</returns>
            public (string Classification, WeightedIndexList Data) AsClassification(StringTableBuilder stringTable)
            {
                // Keywords and topics are weighted identically, so process them in a single
                // pass instead of duplicating the loop body for each collection.
                var weightedIndex = Keyword
                    .Concat(Topic)
                    .Select(item => new WeightedIndexList.WeightedIndex {
                        Index  = stringTable.GetIndex(item),
                        Weight = 1f
                    })
                    .ToList();

                return(Title, new WeightedIndexList {
                    // Merge duplicate indices by summing their weights
                    IndexList = weightedIndex
                                .GroupBy(d => d.Index)
                                .Select(g => new WeightedIndexList.WeightedIndex {
                        Index = g.Key,
                        Weight = g.Sum(d => d.Weight)
                    })
                                .ToArray()
                });
            }
Ejemplo n.º 6
0
        /// <summary>
        /// Verifies that converting a small labelled corpus to sparse vectors and applying
        /// TF-IDF preserves the expected class and feature counts.
        /// </summary>
        public void TestTFIDF()
        {
            var stringTableBuilder = new StringTableBuilder();
            var bag = new ClassificationBag {
                Classification = new[] {
                    Tuple.Create(new[] { "Chinese", "Beijing", "Chinese" }, true),
                    Tuple.Create(new[] { "Chinese", "Chinese", "Shanghai" }, true),
                    Tuple.Create(new[] { "Chinese", "Macao" }, true),
                    Tuple.Create(new[] { "Tokyo", "Japan", "Chinese" }, false),
                }.Select(d => new IndexedClassification {
                    Name = d.Item2 ? "china" : "japan",
                    Data = d.Item1.Select(s => stringTableBuilder.GetIndex(s)).ToArray()
                }).ToArray()
            };

            // Assert.AreEqual takes (expected, actual) — the original passed them reversed,
            // which produces misleading failure messages.
            Assert.AreEqual(4, bag.Classification.Length);
            Assert.AreEqual(3, bag.Classification[0].Data.Length);
            var set = bag.ConvertToSparseVectors(true);

            Assert.AreEqual(2, set.Classification.Length);
            Assert.AreEqual(4, set.Classification[0].Data.Length);

            var tfidf = set.TFIDF();

            // TF-IDF should not change the number of classes or features
            Assert.AreEqual(2, tfidf.Classification.Length);
            Assert.AreEqual(4, tfidf.Classification[0].Data.Length);
        }
        /// <summary>
        /// Builds a basic three-by-three table of single character cells;
        /// the middle row is left-aligned.
        /// </summary>
        private static StringTableBuilder CreateSimpleTableBuilder()
        {
            var builder = new StringTableBuilder();

            foreach (var columnName in new[] { "col1", "col2", "col3" })
            {
                builder.AddColumn(columnName);
            }

            var firstRow = builder.AddRow();
            firstRow.SetCell("col1", "a");
            firstRow.SetCell("col2", "b");
            firstRow.SetCell("col3", "c");

            var secondRow = builder.AddRow().SetHAlignLeft();
            secondRow.SetCell("col1", "d");
            secondRow.SetCell("col2", "e");
            secondRow.SetCell("col3", "f");

            var thirdRow = builder.AddRow();
            thirdRow.SetCell("col1", "g");
            thirdRow.SetCell("col2", "h");
            thirdRow.SetCell("col3", "i");

            return builder;
        }
        /// <summary>
        /// Verifies that AddEmptyRow renders as a row of spaces (one per column)
        /// between the populated rows.
        /// </summary>
        public void AddEmptyRow()
        {
            var builder = new StringTableBuilder();

            builder.AddColumn("col1");
            builder.AddColumn("col2");
            builder.AddColumn("col3");

            var firstRow = builder.AddRow();
            firstRow.SetCell("col1", "a");
            firstRow.SetCell("col2", "b");
            firstRow.SetCell("col3", "c");

            builder.AddEmptyRow();

            var secondRow = builder.AddRow();
            secondRow.SetCell("col1", "d");
            secondRow.SetCell("col2", "e");
            secondRow.SetCell("col3", "f");

            var output = builder.ToString();

            Console.WriteLine(output);

            // The empty row renders as one space per single-character column.
            var expectedOutput =
                "abc" + s_nl +
                "   " + s_nl +
                "def";

            Assert.Equal(expectedOutput, output);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Serialises the type tree: the node records, then the string table, and finally
        /// back-patches the node count and string table length into the header.
        /// </summary>
        /// <param name="writer">Destination writer; its position is advanced past the serialised data</param>
        public void Write(UnityBinaryWriter writer)
        {
            // Skip the 8-byte header (node_count + strtable_length) for now;
            // strtable_length is unknown until the string table has been built.
            int header_position = writer.Position;

            writer.Position += 8;

            StringTableBuilder strtable = new StringTableBuilder();

            // Writes a string reference: an offset into the common string table
            // (flagged with 0x8000) or an offset into the local table being built.
            // Extracted to remove the duplicated Type/Name branches.
            void WriteStringRef(string value)
            {
                int commonId = GetCommonStringID(value);
                if (commonId == -1)  // Not a common string
                {
                    writer.WriteUShort(strtable.AddString(value));
                    writer.WriteUShort(0);
                }
                else
                {
                    writer.WriteUShort((ushort)commonId);
                    writer.WriteUShort(0x8000);
                }
            }

            // Write Nodes
            for (int i = 0; i < Nodes.Length; i++)
            {
                writer.WriteUShort(Nodes[i].Version);
                writer.WriteByte(Nodes[i].Level);
                writer.WriteByte((byte)(Nodes[i].IsArray ? 1 : 0));

                WriteStringRef(Nodes[i].Type);
                WriteStringRef(Nodes[i].Name);

                writer.WriteInt(Nodes[i].ByteSize);
                writer.WriteInt(Nodes[i].Index);
                writer.WriteInt(Nodes[i].MetaFlag);
            }

            // Write StringTable
            byte[] strtable_bytes = strtable.ToBytes();
            writer.WriteBytes(strtable_bytes);

            // Back-patch node_count and strtable_length into the header,
            // then restore the write position to the end of the data.
            int final_pos = writer.Position;

            writer.Position = header_position;
            writer.WriteInt(Nodes.Length);
            writer.WriteInt(strtable_bytes.Length);
            writer.Position = final_pos;
        }
        /// <summary>
        /// Verifies that a table with no columns or rows renders as an empty string.
        /// </summary>
        public void EmptyTable()
        {
            var builder  = new StringTableBuilder();
            var rendered = builder.ToString();

            Console.WriteLine(rendered);

            Assert.Equal("", rendered);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Trains a Bernoulli naive Bayes model on the simple Chinese data set and
        /// checks that the test row is classified as "japan".
        /// </summary>
        public void TestBernoulliNaiveBayes()
        {
            var stringTable  = new StringTableBuilder();
            var trainingData = GetSimpleChineseSet(stringTable);
            var classifier   = trainingData.TrainBernoulliNaiveBayes().CreateClassifier();
            var result       = classifier.Classify(GetTestRow(stringTable));

            Assert.IsTrue(result.First() == "japan");
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Trains a decision tree on the simple Chinese data set and checks that the
        /// first and last rows are classified as "china" and "japan" respectively.
        /// </summary>
        public void TestDecisionTree()
        {
            var stringTable = new StringTableBuilder();
            var table = NaiveBayesTests
                .GetSimpleChineseSet(stringTable)
                .ConvertToSparseVectors(false)
                .ConvertToTable();
            var classifier = table.TrainDecisionTree().CreateClassifier();
            var sampleRows = table.GetRows(new[] { 0, table.RowCount - 1 });

            Assert.IsTrue(classifier.Classify(sampleRows[0]).First() == "china");
            Assert.IsTrue(classifier.Classify(sampleRows[1]).First() == "japan");
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Trains a random forest on the simple Chinese data set and checks that the
        /// first row is classified as "china".
        /// </summary>
        public void TestRandomForest()
        {
            var stringTableBuilder = new StringTableBuilder();
            // Convert the labelled corpus to a weighted index list, then to a data table
            var data       = NaiveBayesTests.GetSimpleChineseSet(stringTableBuilder).ConvertToWeightedIndexList(false).ConvertToTable();
            var model      = data.TrainRandomForest();
            var classifier = model.CreateClassifier();
            // Use the first and last rows of the training data as test samples
            var testRows   = data.GetRows(new[] { 0, data.RowCount - 1 });

            Assert.IsTrue(classifier.Classify(testRows[0]).GetBestClassification() == "china");
            // NOTE(review): the second assertion was deliberately disabled — presumably the
            // forest is not reliable on this row; confirm before re-enabling.
            //Assert.IsTrue(classifier.Classify(testRows[1]).First() == "japan");
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Renders the usage table for a command's parameters: name, alias, default value,
        /// type and description, separated by two-space padding columns.
        /// </summary>
        /// <param name="command">Command whose parameters are listed</param>
        /// <param name="commandName">Command name for the heading, or null for a generic heading</param>
        /// <returns>The formatted parameter usage text</returns>
        private static string UsageParameters(Command command, string commandName)
        {
            var builder = new StringBuilder();

            builder.AppendLine(commandName == null ? "Parameters:" : $"{commandName} parameters:");

            var paramNames = command.Parameters.Keys.OrderBy(s => s).ToList();
            var table      = new StringTableBuilder();

            // "spaceN" columns are two-space separators between the data columns.
            var columnNames = new[] {
                "space0", "name", "space1", "alias", "space2",
                "default-value", "space3", "type", "space4", "description"
            };
            foreach (var columnName in columnNames)
            {
                table.AddColumn(columnName);
            }

            // Header row: separator columns render as spaces, data columns as their own name.
            var headerRow = table.AddRow();
            foreach (var columnName in columnNames)
            {
                headerRow.SetCell(columnName, columnName.StartsWith("space") ? "  " : columnName);
            }

            foreach (var paramName in paramNames)
            {
                var param = command.Parameters[paramName];
                var row   = table.AddRow();
                row.SetCell("space0", "  ");
                row.SetCell("name", $"--{param.Name}");
                row.SetCell("space1", "  ");
                row.SetCell("alias", $"{AliasKey(param)}");
                row.SetCell("space2", "  ");
                row.SetCell("default-value", param.Optional ? $"{param.DefaultValue}" : "");
                row.SetCell("space3", "  ");
                row.SetCell("type", $"{param.Type.Name}");
                row.SetCell("space4", "  ");
                row.SetCell("description", $"{param.Description}");
            }
            builder.Append(table.ToString());

            return builder.ToString();
        }
Ejemplo n.º 15
0
 /// <summary>
 /// Builds the small Chinese/Japanese labelled corpus used across the naive Bayes tests.
 /// </summary>
 /// <param name="stringTableBuilder">String table used to map tokens to indices</param>
 public static ClassificationBag GetSimpleChineseSet(StringTableBuilder stringTableBuilder)
 {
     // sample data from: http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
     var samples = new[] {
         Tuple.Create(new[] { "Chinese", "Beijing", "Chinese" }, true),
         Tuple.Create(new[] { "Chinese", "Chinese", "Shanghai" }, true),
         Tuple.Create(new[] { "Chinese", "Macao" }, true),
         Tuple.Create(new[] { "Tokyo", "Japan", "Chinese" }, false),
     };
     return new ClassificationBag {
         Classification = samples.Select(sample => new IndexedClassification {
             Name = sample.Item2 ? "china" : "japan",
             Data = sample.Item1.Select(token => stringTableBuilder.GetIndex(token)).ToArray()
         }).ToArray()
     };
 }
Ejemplo n.º 16
0
        /// <summary>
        /// Verifies that converting the simple Chinese set to a weighted index list and
        /// applying TF-IDF preserves the expected class and feature counts.
        /// </summary>
        public void TestTFIDF()
        {
            var stringTableBuilder = new StringTableBuilder();
            var data = NaiveBayesTests.GetSimpleChineseSet(stringTableBuilder);

            // Assert.AreEqual takes (expected, actual) — the original passed them reversed,
            // which produces misleading failure messages.
            Assert.AreEqual(4, data.Count);
            Assert.AreEqual(3, data.First().Data.Count);
            var set = data.ConvertToWeightedIndexList(true);

            Assert.AreEqual(2, set.Count);
            Assert.AreEqual(4, set.First().Data.Count);
            var tfidf = set.TFIDF();

            // TF-IDF should not change the number of classes or features
            Assert.AreEqual(2, tfidf.Count);
            Assert.AreEqual(4, tfidf.First().Data.Count);
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Renders a two-column usage table listing each command's name and description.
        /// </summary>
        /// <param name="commandNames">Names of the commands to list, in display order</param>
        private string UsageCommands(string[] commandNames)
        {
            var builder = new StringTableBuilder();

            builder.AddColumn("col1");
            builder.AddColumn("col2");
            foreach (var name in commandNames)
            {
                var command = _commands[name];
                var row     = builder.AddRow();
                row.SetCell("col1", $"  {command.Name}");
                row.SetCell("col2", $"  {command.Description}");
            }

            return builder.ToString();
        }
Ejemplo n.º 18
0
 /// <summary>
 /// Smoke test: clusters the simple Chinese data set with k-means (k = 2) and maps
 /// each cluster back to its labels.
 /// NOTE(review): no assertions — only verifies the pipeline runs without throwing.
 /// </summary>
 public void TestKMeans()
 {
     var stringTable = new StringTableBuilder();
     var labelledVectors = NaiveBayesTests
         .GetSimpleChineseSet(stringTable)
         .ConvertToWeightedIndexList(false)
         .Vectorise()
         .ToDictionary(item => _lap.CreateVector(item.Data), item => item.Classification);
     var foundClusters = labelledVectors.Select(entry => entry.Key).ToList().KMeans(2);
     var clusterLabels = foundClusters
         .Select(cluster => cluster.Select(vector => labelledVectors[vector]).ToArray())
         .ToList();
 }
Ejemplo n.º 19
0
        /// <summary>
        /// Renders the grammar footer explaining how parameters are written on the command line.
        /// </summary>
        private static string UsageFooter()
        {
            var builder = new StringTableBuilder().SetInnerCellColumnPadding(1);

            builder.AddColumn("col1");
            builder.AddColumn("col2");
            builder.AddColumn("col3");

            // One entry per grammar rule: lhs, ":=", rhs.
            var rules = new[] {
                new[] { "<parameters>", ":=", "<p1-key> <p1-value> ... <pN-key> <pN-value>" },
                new[] { "<p-key>", ":=", "-<p-alias>|--<p-name>" },
            };
            foreach (var rule in rules)
            {
                var row = builder.AddRow();
                row.SetCell("col1", rule[0]);
                row.SetCell("col2", rule[1]);
                row.SetCell("col3", rule[2]);
            }

            return builder.ToString();
        }
Ejemplo n.º 20
0
        /// <summary>
        /// Classifies text into either positive or negative sentiment
        /// The data files can be downloaded from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
        /// </summary>
        /// <param name="dataFilesPath">Path to extracted data files</param>
        /// <remarks>
        /// Trains Bernoulli and multinomial naive Bayes classifiers plus a neural network,
        /// stacks their outputs into a second-level neural network, then runs an interactive
        /// console loop classifying user-entered text with all four models.
        /// </remarks>
        public static void SentimentClassification(string dataFilesPath)
        {
            var files = new[] {
                "amazon_cells_labelled.txt",
                "imdb_labelled.txt",
                "yelp_labelled.txt"
            };
            var LINE_SEPARATOR = "\n".ToCharArray();
            var SEPARATOR      = "\t".ToCharArray();
            var stringTable    = new StringTableBuilder();
            // Each line is "<sentence>\t<0|1>"; tokenise the sentence, map the flag to a
            // "positive"/"negative" label, and drop blank lines and empty token lists.
            // Shuffle(0) presumably uses a fixed seed for reproducibility — confirm.
            var sentimentData  = files.SelectMany(f => File.ReadAllText(dataFilesPath + f)
                                                  .Split(LINE_SEPARATOR)
                                                  .Where(l => !String.IsNullOrWhiteSpace(l))
                                                  .Select(l => l.Split(SEPARATOR))
                                                  .Select(s => Tuple.Create(_Tokenise(s[0]), s[1][0] == '1' ? "positive" : "negative"))
                                                  .Where(d => d.Item1.Any())
                                                  ).Shuffle(0).ToList();
            var splitSentimentData = sentimentData.Split();

            // build training and test classification bag
            var trainingClassificationBag = _BuildClassificationBag(splitSentimentData.Training, stringTable);
            var testClassificationBag     = _BuildClassificationBag(splitSentimentData.Test, stringTable);

            // train a bernoulli naive bayes classifier
            var bernoulli = trainingClassificationBag.TrainBernoulliNaiveBayes();

            Console.WriteLine("Bernoulli accuracy: {0:P}", testClassificationBag
                              .Classify(bernoulli.CreateClassifier())
                              .Average(r => r.Score)
                              );

            // train a multinomial naive bayes classifier
            var multinomial = trainingClassificationBag.TrainMultinomialNaiveBayes();

            Console.WriteLine("Multinomial accuracy: {0:P}", testClassificationBag
                              .Classify(multinomial.CreateClassifier())
                              .Average(r => r.Score)
                              );

            // convert the bags to sparse vectors
            var sentimentDataBag        = _BuildClassificationBag(sentimentData, stringTable);
            var sentimentDataSet        = sentimentDataBag.ConvertToSparseVectors(false);
            var sentimentDataTableSplit = sentimentDataSet.Split();

            using (var lap = GPUProvider.CreateLinearAlgebra(false)) {
                // +1 because indices are zero-based: vector length must cover the max index
                var maxIndex            = sentimentDataSet.GetMaximumIndex() + 1;
                var trainingData        = sentimentDataTableSplit.Training.CreateTrainingDataProvider(lap, maxIndex);
                var testData            = sentimentDataTableSplit.Test.CreateTrainingDataProvider(lap, maxIndex);
                // Maps output-vector index back to the class label
                var classificationTable = sentimentDataSet.GetClassifications().ToDictionary(d => (int)d.Value, d => d.Key);

                // create the three classifiers
                var bernoulliClassifier   = bernoulli.CreateClassifier();
                var multinomialClassifier = multinomial.CreateClassifier();
                var neuralClassifier      = lap.NN.CreateFeedForward(lap.NN.CreateTrainingContext(ErrorMetricType.OneHot, learningRate: 0.1f, batchSize: 128)
                                                                     .TrainNeuralNetwork(lap, trainingData, testData, new LayerDescriptor(0.1f)
                {
                    WeightUpdate         = WeightUpdateType.Adam,
                    Activation           = ActivationType.Relu,
                    WeightInitialisation = WeightInitialisationType.Xavier,
                    LayerTrainer         = LayerTrainerType.Dropout
                }, hiddenLayerSize: 512, numEpochs: 10)
                                                                     );

                // create the stacked training set: each sample's features are the
                // weighted outputs of the three base classifiers
                Console.WriteLine("Creating model stack data set...");
                var modelStacker = new ModelStacker();
                foreach (var item in sentimentDataSet.Classification)
                {
                    var indexList = item.GetIndexList();
                    modelStacker.Add(new[] {
                        bernoulliClassifier.GetWeightedClassifications(indexList),
                        multinomialClassifier.GetWeightedClassifications(indexList),
                        neuralClassifier.GetWeightedClassifications(item.Vectorise(maxIndex), classificationTable)
                    }, item.Name);
                }

                // convert the stacked data to a data table and split it into training and test sets
                var sentimentDataTable  = modelStacker.GetTable();
                var dataTableVectoriser = sentimentDataTable.GetVectoriser();
                var split             = sentimentDataTable.Split();
                var trainingStack     = lap.NN.CreateTrainingDataProvider(split.Training, dataTableVectoriser);
                var testStack         = lap.NN.CreateTrainingDataProvider(split.Test, dataTableVectoriser);
                var targetColumnIndex = sentimentDataTable.TargetColumnIndex;

                // train a neural network on the stacked data; learning rate drops
                // from 0.3 to 0.1 after epoch 10
                var trainingContext = lap.NN.CreateTrainingContext(ErrorMetricType.OneHot, learningRate: 0.3f, batchSize: 8);
                trainingContext.ScheduleTrainingRateChange(10, 0.1f);
                var stackNN = lap.NN.CreateFeedForward(trainingContext.TrainNeuralNetwork(lap, trainingStack, testStack, new LayerDescriptor(0.1f)
                {
                    WeightUpdate         = WeightUpdateType.RMSprop,
                    Activation           = ActivationType.LeakyRelu,
                    WeightInitialisation = WeightInitialisationType.Xavier
                }, hiddenLayerSize: 32, numEpochs: 20));

                uint stringIndex;
                Console.WriteLine("Enter some text to test the classifiers...");
                // Interactive loop: blank line exits
                while (true)
                {
                    Console.Write(">");
                    var line = Console.ReadLine();
                    if (String.IsNullOrWhiteSpace(line))
                    {
                        break;
                    }

                    // Keep only tokens already seen during training
                    var tokens    = _Tokenise(line);
                    var indexList = new List <uint>();
                    foreach (var token in tokens)
                    {
                        if (stringTable.TryGetIndex(token, out stringIndex))
                        {
                            indexList.Add(stringIndex);
                        }
                    }
                    if (indexList.Any())
                    {
                        // Build a dense term-frequency vector for the neural classifier
                        var queryTokens = indexList.GroupBy(d => d).Select(g => Tuple.Create(g.Key, (float)g.Count())).ToList();
                        var vector      = new float[maxIndex];
                        foreach (var token in queryTokens)
                        {
                            vector[token.Item1] = token.Item2;
                        }

                        Console.WriteLine("Bernoulli classification: " + bernoulliClassifier.Classify(indexList).First());
                        Console.WriteLine("Multinomial classification: " + multinomialClassifier.Classify(indexList).First());
                        Console.WriteLine("Neural network classification: " + classificationTable[neuralClassifier.Execute(vector).MaximumIndex()]);

                        // Feed the three base-model outputs into the stacked network
                        var stackInput = modelStacker.Vectorise(new[] {
                            bernoulliClassifier.GetWeightedClassifications(indexList),
                            multinomialClassifier.GetWeightedClassifications(indexList),
                            neuralClassifier.GetWeightedClassifications(vector, classificationTable)
                        });
                        Console.WriteLine("Stack classification: " + dataTableVectoriser.GetOutputLabel(targetColumnIndex, stackNN.Execute(stackInput).MaximumIndex()));
                    }
                    else
                    {
                        Console.WriteLine("Sorry, none of those words have been seen before.");
                    }
                    Console.WriteLine();
                }
            }
            Console.WriteLine();
        }
Ejemplo n.º 21
0
 /// <summary>
 /// Converts (tokens, label) tuples into a classification bag, mapping each token
 /// to an index via the supplied string table.
 /// </summary>
 /// <param name="data">Labelled token arrays: Item1 = tokens, Item2 = class label</param>
 /// <param name="stringTable">String table used to map tokens to indices</param>
 static ClassificationBag _BuildClassificationBag(IReadOnlyList <Tuple <string[], string> > data, StringTableBuilder stringTable)
 {
     var classifications = data
         .Select(item => new IndexedClassification {
             Name = item.Item2,
             Data = item.Item1.Select(token => stringTable.GetIndex(token)).ToArray()
         })
         .ToArray();

     return new ClassificationBag {
         Classification = classifications
     };
 }
Ejemplo n.º 22
0
 public static IReadOnlyList <(string Label, IndexList Data)> GetSimpleChineseSet(StringTableBuilder stringTableBuilder)
 {
     // sample data from: http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
     var data = new[] {
Ejemplo n.º 23
0
        /// <summary>
        /// Downloads the AAAI-14 accepted papers dataset, extracts keyword/topic features
        /// per document, clusters the documents with NNMF and hands the resulting clusters
        /// to the UI. All UI interaction is marshalled through Dispatcher.Invoke, so this
        /// method is presumably run off the UI thread — confirm at the call site.
        /// </summary>
        void _ClusterDataset()
        {
            Dispatcher.Invoke((SimpleDelegate) delegate() {
                _statusMessage.Add("Downloading dataset...");
            });
            var uri           = new Uri("https://archive.ics.uci.edu/ml/machine-learning-databases/00307/%5bUCI%5d%20AAAI-14%20Accepted%20Papers%20-%20Papers.csv");
            var KEYWORD_SPLIT = " \n".ToCharArray();
            var TOPIC_SPLIT   = "\n".ToCharArray();

            // download the document list
            var docList = new List <AAAIDocument>();

            using (var client = new WebClient()) {
                var data = client.DownloadData(uri);

                Dispatcher.Invoke((SimpleDelegate) delegate() {
                    _statusMessage.Add("Building data table...");
                });

                // parse the file CSV
                var dataTable = new StreamReader(new MemoryStream(data)).ParseCSV(',');

                // create strongly typed documents from the data table
                // (column layout: 0 = title, 2 = group, 3 = keywords, 4 = topics, 5 = abstract)
                dataTable.ForEach(row => docList.Add(new AAAIDocument {
                    Abstract = row.GetField <string>(5),
                    Keyword  = row.GetField <string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
                    Topic    = row.GetField <string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Group    = row.GetField <string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Title    = row.GetField <string>(0)
                }));
            }

            // create a document lookup table keyed by title
            var docTable = docList.ToDictionary(d => d.Title, d => d);

            // extract features from the document's metadata
            var stringTable       = new StringTableBuilder();
            var classificationSet = new SparseVectorClassificationSet {
                Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
            };

            // vectorise the documents — the 'true' flag presumably enables normalisation; confirm
            var encodings = classificationSet.Vectorise(true);

            // convert the sparse feature vectors into dense vectors
            var documentClusterList = new List <DocumentCluster>();

            using (var lap = Provider.CreateLinearAlgebra()) {
                // Map each dense vector back to its source document
                var lookupTable = encodings
                                  .Select(d => Tuple.Create(d, lap.Create(d.Data).AsIndexable()))
                                  .ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification])
                ;
                var vectorList = lookupTable.Select(d => d.Key).ToList();

                Dispatcher.Invoke((SimpleDelegate) delegate() {
                    _statusMessage.Add("Clustering data...");
                });

                // cluster the dense vectors; one cluster per available colour,
                // reporting the NNMF cost after each of the 40 iterations
                using (var nnmf = new NNMF(lap, vectorList, _clusterColour.Length)) {
                    var clusters = nnmf.Cluster(40, cost => {
                        Dispatcher.Invoke((SimpleDelegate) delegate() {
                            _statusMessage.Add("NNMF error: " + cost.ToString());
                        });
                    });

                    // create document clusters from the NNMF results, describing each
                    // cluster by its top 32 ranked feature strings
                    int index = 0;
                    foreach (var cluster in clusters)
                    {
                        var documentCluster = new List <AAAIDocument>();
                        foreach (var item in cluster)
                        {
                            var document = lookupTable[item];
                            documentCluster.Add(document);
                        }
                        var desc = String.Join(", ", nnmf.GetRankedFeatures(index++)
                                               .Select(i => stringTable.GetString(i))
                                               .Take(32)
                                               );
                        documentClusterList.Add(new DocumentCluster(documentCluster, desc));
                    }

                    // collect the cluster membership for each document
                    for (int i = 0, len = vectorList.Count; i < len; i++)
                    {
                        lookupTable[vectorList[i]].ClusterMembership = nnmf.GetClusterMembership(i);
                    }
                }
            }

            Dispatcher.Invoke((SimpleDelegate) delegate() {
                _UpdateUI(documentClusterList);
            });
        }
        /// <summary>
        /// Classifies text into either positive or negative sentiment.
        /// The data files can be downloaded from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
        /// Trains a bernoulli naive bayes, a multinomial naive bayes and a neural network classifier,
        /// then stacks all three into a combined classifier and finally enters an interactive loop
        /// that classifies text typed at the console (empty input exits).
        /// </summary>
        /// <param name="dataFilesPath">Path to extracted data files</param>
        public static void SentimentClassification(string dataFilesPath)
        {
            var files          = new[] { "amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt" };
            var LINE_SEPARATOR = "\n".ToCharArray();
            var SEPARATOR      = "\t".ToCharArray();
            var stringTable    = new StringTableBuilder();

            // each line is "<sentence>\t<label>" where a label of '1' means positive;
            // tokenise the sentence, drop lines with no tokens, then shuffle with a
            // fixed seed (0) so the train/test split is repeatable
            var sentimentData  = files.SelectMany(f =>
                                                  File.ReadAllText(dataFilesPath + f).Split(LINE_SEPARATOR).
                                                  Where(l => !string.IsNullOrWhiteSpace(l)).Select(l => l.Split(SEPARATOR)).
                                                  Select(s => Tuple.Create(Tokenise(s[0]), s[1][0] == '1' ? "positive" : "negative")).
                                                  Where(d => d.Item1.Any())).Shuffle(0).ToList();
            var splitSentimentData = sentimentData.Split();

            // build training and test classification bag
            var trainingClassificationBag =
                BuildIndexedClassifications(splitSentimentData.Training, stringTable);
            var testClassificationBag =
                BuildIndexedClassifications(splitSentimentData.Test, stringTable);

            // train a bernoulli naive bayes classifier
            var bernoulli = trainingClassificationBag.TrainBernoulliNaiveBayes();

            Console.WriteLine("Bernoulli accuracy: {0:P}",
                              testClassificationBag.Classify(bernoulli.CreateClassifier()).Average(r => r.Score));

            // train a multinomial naive bayes classifier
            var multinomial = trainingClassificationBag.TrainMultinomialNaiveBayes();

            Console.WriteLine("Multinomial accuracy: {0:P}",
                              testClassificationBag.Classify(multinomial.CreateClassifier()).Average(r => r.Score));

            // convert the index lists to vectors and normalise along the way
            var sentimentDataTable = BuildIndexedClassifications(sentimentData, stringTable).
                                     ConvertToTable().Normalise(NormalisationType.Standard);
            var vectoriser        = sentimentDataTable.GetVectoriser();
            var sentimentDataSet  = sentimentDataTable.Split(0);
            var dataTableAnalysis = sentimentDataTable.GetAnalysis();

            using (var lap = BrightWireProvider.CreateLinearAlgebra())
            {
                var graph            = new GraphFactory(lap);
                var trainingData     = graph.CreateDataSource(sentimentDataSet.Training, vectoriser);
                var testData         = graph.CreateDataSource(sentimentDataSet.Test, vectoriser);
                var indexListEncoder = (IIndexListEncoder)trainingData;

                // use a one hot encoding error metric, rmsprop gradient descent and xavier weight initialisation
                var errorMetric = graph.ErrorMetric.OneHotEncoding;
                var propertySet = graph.CurrentPropertySet.Use(graph.GradientDescent.RmsProp).
                                  Use(graph.WeightInitialisation.Xavier);
                var engine = graph.CreateTrainingEngine(trainingData, 0.3f);

                // learning rate schedule: starts at 0.3, drops to 0.1 at epoch 5,
                // jumps to 1 at epoch 11 and settles back at 0.3 from epoch 15
                // NOTE(review): the jump up to 1f mid-training looks unusual - confirm it is intentional
                engine.LearningContext.ScheduleLearningRate(5, 0.1f);
                engine.LearningContext.ScheduleLearningRate(11, 1f);
                engine.LearningContext.ScheduleLearningRate(15, 0.3f);

                // build the first neural network classifier: 512 hidden units with
                // relu + dropout, then a relu output layer
                // NOTE(review): relu on the output layer before a one-hot error metric is
                // unusual (softmax is more common) - confirm this matches the library samples
                var neuralNetworkWire = graph.Connect(engine).AddFeedForward(512, "layer1")
                                        //.AddBatchNormalisation()
                                        .Add(graph.ReluActivation()).AddDropOut(0.5f).
                                        AddFeedForward(trainingData.OutputSize, "layer2").Add(graph.ReluActivation()).
                                        AddBackpropagation(errorMetric, "first-network");

                // train the network, keeping the best scoring model seen during training
                Console.WriteLine("Training neural network classifier...");
                const int  TRAINING_ITERATIONS = 10;
                GraphModel bestNetwork         = null;
                engine.Train(TRAINING_ITERATIONS, testData, errorMetric, network => bestNetwork = network);
                if (bestNetwork != null)
                {
                    engine.LoadParametersFrom(bestNetwork.Graph);
                }
                var firstClassifier = graph.CreateEngine(engine.Graph);

                // stop the backpropagation to the first neural network so the stacked
                // training below does not disturb its learned weights
                engine.LearningContext.EnableNodeUpdates(neuralNetworkWire.Find("layer1"), false);
                engine.LearningContext.EnableNodeUpdates(neuralNetworkWire.Find("layer2"), false);

                // create the bernoulli classifier wire
                var bernoulliClassifier = bernoulli.CreateClassifier();
                var bernoulliWire       = graph.Connect(engine).AddClassifier(bernoulliClassifier,
                                                                              sentimentDataSet.Training, dataTableAnalysis);

                // create the multinomial classifier wire
                var multinomialClassifier = multinomial.CreateClassifier();
                var multinomialWire       = graph.Connect(engine).AddClassifier(multinomialClassifier,
                                                                                sentimentDataSet.Training, dataTableAnalysis);

                // join the bernoulli, multinomial and neural network classification outputs
                var firstNetwork = neuralNetworkWire.Find("first-network");
                var joined       = graph.Join(multinomialWire,
                                              graph.Join(bernoulliWire, graph.Connect(trainingData.OutputSize, firstNetwork)));

                // train an additional classifier on the output of the previous three classifiers
                joined.AddFeedForward(outputSize: 64).Add(graph.ReluActivation()).
                AddDropOut(dropOutPercentage: 0.5f).AddFeedForward(trainingData.OutputSize).
                Add(graph.ReluActivation()).AddBackpropagation(errorMetric);

                // train the stacked network, again keeping the best model
                Console.WriteLine("Training stacked neural network classifier...");
                GraphModel bestStackedNetwork = null;
                engine.Train(10, testData, errorMetric, network => bestStackedNetwork = network);
                if (bestStackedNetwork != null)
                {
                    engine.LoadParametersFrom(bestStackedNetwork.Graph);
                }

                // interactive loop: classify console input with all four classifiers
                Console.WriteLine("Enter some text to test the classifiers...");
                while (true)
                {
                    Console.Write(">");
                    var line = Console.ReadLine();
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        break;
                    }

                    // map tokens to string table indices; tokens never seen during
                    // training are silently skipped
                    var tokens    = Tokenise(line);
                    var indexList = new List <uint>();
                    foreach (var token in tokens)
                    {
                        if (stringTable.TryGetIndex(token, out uint stringIndex))
                        {
                            indexList.Add(stringIndex);
                        }
                    }

                    if (indexList.Any())
                    {
                        // (removed dead code that built an unused term-frequency vector here)
                        var indexList2   = IndexList.Create(indexList.ToArray());
                        var encodedInput = indexListEncoder.Encode(indexList2);
                        Console.WriteLine("Bernoulli classification: " +
                                          bernoulliClassifier.Classify(indexList2).First().Label);
                        Console.WriteLine("Multinomial classification: " +
                                          multinomialClassifier.Classify(indexList2).First().Label);
                        // argmax over the two output activations picks the predicted label
                        var result         = firstClassifier.Execute(encodedInput);
                        var classification = vectoriser.GetOutputLabel(1,
                                                                       (result.Output[0].Data[0] > result.Output[0].Data[1]) ? 0 : 1);
                        Console.WriteLine("Neural network classification: " + classification);
                        var stackedResult         = engine.Execute(encodedInput);
                        var stackedClassification = vectoriser.GetOutputLabel(1,
                                                                              (stackedResult.Output[0].Data[0] > stackedResult.Output[0].Data[1]) ? 0 : 1);
                        Console.WriteLine("Stack classification: " + stackedClassification);
                    }
                    else
                    {
                        Console.WriteLine("Sorry, none of those words have been seen before.");
                    }

                    Console.WriteLine();
                }
            }

            Console.WriteLine();
        }
 /// <summary>
 /// Converts tokenised sentences into (classification label, index list) pairs,
 /// mapping every token to its index in the shared string table.
 /// </summary>
 /// <param name="data">Tuples of (tokens, classification label)</param>
 /// <param name="stringTable">Shared string table used to assign token indices</param>
 /// <returns>One (label, index list) entry per input tuple, in the same order</returns>
 private static IReadOnlyList <(string Classification, IndexList Data)> BuildIndexedClassifications(
     IReadOnlyList <Tuple <string[], string> > data, StringTableBuilder stringTable)
 {
     var result = new List <(string Classification, IndexList Data)>(data.Count);
     foreach (var item in data)
     {
         var tokenIndices = item.Item1.Select(token => stringTable.GetIndex(token)).ToArray();
         result.Add((item.Item2, IndexList.Create(tokenIndices)));
     }
     return result;
 }
Ejemplo n.º 26
0
        /// <summary>
        /// Downloads the AAAI-14 accepted papers dataset, builds sparse feature vectors from each
        /// paper's keywords and topics, projects them into a 3D latent space via SVD, clusters the
        /// latent vectors with k-means and renders each document as a coloured cube in the 3D viewport.
        /// Intended to run off the UI thread; all UI access goes through Dispatcher.Invoke.
        /// </summary>
        void _AnalyseDataset()
        {
            Dispatcher.Invoke(() => {
                _statusMessage.Add("Downloading dataset...");
            });
            var uri           = new Uri("https://archive.ics.uci.edu/ml/machine-learning-databases/00307/%5bUCI%5d%20AAAI-14%20Accepted%20Papers%20-%20Papers.csv");
            var KEYWORD_SPLIT = " \n".ToCharArray();
            var TOPIC_SPLIT   = "\n".ToCharArray();

            // download the document list
            var docList = new List <AAAIDocument>();

            using (var client = new WebClient()) {
                var data = client.DownloadData(uri);

                Dispatcher.Invoke(() => {
                    _statusMessage.Add("Building data table...");
                });

                // parse the file CSV
                var dataTable = new StreamReader(new MemoryStream(data)).ParseCSV(',');

                // create strongly typed documents from the data table
                // column layout assumed: 0=title, 2=groups, 3=keywords, 4=topics, 5=abstract
                // - TODO confirm against the dataset's header row
                dataTable.ForEach(row => docList.Add(new AAAIDocument {
                    Abstract = row.GetField <string>(5),
                    Keyword  = row.GetField <string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
                    Topic    = row.GetField <string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Group    = row.GetField <string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Title    = row.GetField <string>(0)
                }));
            }

            // create a document lookup table keyed by title
            // NOTE(review): duplicate titles would throw here - presumably titles are unique
            var docTable = docList.ToDictionary(d => d.Title, d => d);

            // extract features from the document's metadata
            var stringTable       = new StringTableBuilder();
            var classificationSet = new SparseVectorClassificationSet {
                Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
            };

            // create dense feature vectors and normalise along the way
            var encodings = classificationSet.Vectorise(true);

            using (var lap = Provider.CreateLinearAlgebra()) {
                // map each dense vector back to its source document
                var lookupTable = encodings.Select(d => Tuple.Create(d, lap.Create(d.Data))).ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification]);
                var vectorList  = lookupTable.Select(d => d.Key).ToList();

                // create a term/document matrix with terms as columns and documents as rows
                var matrix = lap.CreateMatrix(vectorList.Select(d => d.Data).ToList());

                Dispatcher.Invoke(() => {
                    _statusMessage.Add("Performing latent semantic analysis...");
                });

                // compute the SVD; K = 3 so each document gets x/y/z coordinates for the 3D plot
                const int K        = 3;
                var       kIndices = Enumerable.Range(0, K).ToList();
                var       matrixT  = matrix.Transpose();
                var       svd      = matrixT.Svd();

                // create latent space from the top K singular values/vectors
                var s  = lap.CreateDiagonal(svd.S.AsIndexable().Values.Take(K).ToList());
                var v2 = svd.VT.GetNewMatrixFromRows(kIndices);
                using (var sv2 = s.Multiply(v2)) {
                    // one 3-component latent vector per document, in the original document order
                    var vectorList2  = sv2.AsIndexable().Columns.ToList();
                    var lookupTable2 = vectorList2.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);

                    // cluster the latent space, one cluster per available colour
                    var clusters     = vectorList2.KMeans(COLOUR_LIST.Length);
                    var clusterTable = clusters
                                       .Select((l, i) => Tuple.Create(l, i))
                                       .SelectMany(d => d.Item1.Select(v => Tuple.Create(v, d.Item2)))
                                       .ToDictionary(d => d.Item1, d => COLOUR_LIST[d.Item2])
                    ;

                    // build the document list, tracking the bounding box of the latent
                    // coordinates so they can be normalised for display afterwards
                    var    documentList = new List <Document>();
                    int    index = 0;
                    double maxX = double.MinValue, minX = double.MaxValue, maxY = double.MinValue, minY = double.MaxValue, maxZ = double.MinValue, minZ = double.MaxValue;
                    foreach (var item in vectorList2)
                    {
                        float x = item[0];
                        float y = item[1];
                        float z = item[2];
                        documentList.Add(new Document(x, y, z, index++, lookupTable2[item], clusterTable[item]));
                        if (x > maxX)
                        {
                            maxX = x;
                        }
                        if (x < minX)
                        {
                            minX = x;
                        }
                        if (y > maxY)
                        {
                            maxY = y;
                        }
                        if (y < minY)
                        {
                            minY = y;
                        }
                        if (z > maxZ)
                        {
                            maxZ = z;
                        }
                        if (z < minZ)
                        {
                            minZ = z;
                        }
                    }
                    // rescale every document's coordinates into the observed range
                    double rangeX = maxX - minX;
                    double rangeY = maxY - minY;
                    double rangeZ = maxZ - minZ;
                    foreach (var document in documentList)
                    {
                        document.Normalise(minX, rangeX, minY, rangeY, minZ, rangeZ);
                    }

                    // everything below touches WPF controls, so it runs on the UI thread
                    Dispatcher.Invoke(() => {
                        var numDocs   = documentList.Count;
                        _cube         = new Cube[numDocs];
                        _searchResult = new SearchResult[numDocs];

                        _statusMessage.Add("Creating 3D graph...");

                        // one cube and one search result entry per document, sharing the cluster colour
                        var SCALE = 10;
                        for (var i = 0; i < numDocs; i++)
                        {
                            var document        = documentList[i];
                            var cube            = _cube[i] = new Cube(SCALE * document.X, SCALE * document.Y, SCALE * document.Z, i);
                            var searchResult    = _searchResult[i] = new SearchResult(document.AAAIDocument, i);
                            cube.Colour         = document.Colour;
                            searchResult.Colour = document.Colour;

                            searchResult.MouseHoverEvent += new SearchResult.MouseHoverDelegate(searchResult_MouseHoverEvent);
                            viewPort.Children.Add(cube);
                        }

                        // group the results panel by cluster colour
                        foreach (var item in _searchResult.OrderBy(sr => sr.Colour.GetHashCode()))
                        {
                            panelResults.Children.Add(item);
                        }

                        // swap the progress UI for the rendered viewport
                        icStatus.Visibility = Visibility.Collapsed;
                        viewPort.Visibility = Visibility.Visible;
                        progress.Visibility = Visibility.Collapsed;
                    });
                }
            }
        }
Ejemplo n.º 27
0
        /// <summary>
        /// Cluster a tagged set of documents
        /// Can be downloaded from https://archive.ics.uci.edu/ml/machine-learning-databases/00307/
        /// Compares four approaches: k-means and NNMF on the raw feature vectors, k-means after a
        /// random projection, and k-means in an SVD latent space; each result is written to a file.
        /// </summary>
        /// <param name="dataFilePath">The path to the data file</param>
        /// <param name="outputPath">A directory to write the output files to</param>
        public static void TextClustering(string dataFilePath, string outputPath)
        {
            IDataTable dataTable;

            using (var reader = new StreamReader(dataFilePath)) {
                dataTable = reader.ParseCSV();
            }

            var KEYWORD_SPLIT = " \n".ToCharArray();
            var TOPIC_SPLIT   = "\n".ToCharArray();

            var docList = new List <AAAIDocument>();

            // create strongly typed documents from the data table
            // column layout assumed: 0=title, 2=groups, 3=keywords, 4=topics, 5=abstract
            // - TODO confirm against the dataset's header row
            dataTable.ForEach(row => docList.Add(new AAAIDocument {
                Abstract = row.GetField <string>(5),
                Keyword  = row.GetField <string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
                Topic    = row.GetField <string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                Group    = row.GetField <string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                Title    = row.GetField <string>(0)
            }));
            // lookup by title; the distinct group count becomes the target cluster count
            var docTable  = docList.ToDictionary(d => d.Title, d => d);
            var allGroups = new HashSet <string>(docList.SelectMany(d => d.Group));

            // extract sparse keyword/topic features, then vectorise them into dense vectors
            var stringTable       = new StringTableBuilder();
            var classificationSet = new SparseVectorClassificationSet {
                Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
            };
            var encodings = classificationSet.Vectorise(true);

            using (var lap = Provider.CreateLinearAlgebra()) {
                // map each dense vector back to its source document
                var lookupTable = encodings.Select(d => Tuple.Create(d, lap.Create(d.Data))).ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification]);
                var vectorList  = lookupTable.Select(d => d.Key).ToList();

                // NOTE(review): outputPath is concatenated without a separator - presumably it
                // must end with a directory separator; confirm with callers
                Console.WriteLine("Kmeans clustering...");
                _WriteClusters(outputPath + "kmeans.txt", vectorList.KMeans(allGroups.Count), lookupTable);

                // non-negative matrix factorisation, capped at 100 iterations
                Console.WriteLine("NNMF clustering...");
                _WriteClusters(outputPath + "nnmf.txt", vectorList.NNMF(lap, allGroups.Count, 100), lookupTable);

                // create a term/document matrix with terms as columns and documents as rows
                var matrix = lap.CreateMatrix(vectorList.Select(v => v.Data).ToList());
                vectorList.ForEach(v => v.Dispose());

                // randomly project the high-dimensional vectors down to 512 dimensions
                Console.WriteLine("Creating random projection...");
                using (var randomProjection = lap.CreateRandomProjection((int)classificationSet.GetMaximumIndex() + 1, 512)) {
                    using (var projectedMatrix = randomProjection.Compute(matrix)) {
                        var vectorList2  = Enumerable.Range(0, projectedMatrix.RowCount).Select(i => projectedMatrix.Row(i)).ToList();
                        var lookupTable2 = vectorList2.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);

                        Console.WriteLine("Kmeans clustering of random projection...");
                        _WriteClusters(outputPath + "projected-kmeans.txt", vectorList2.KMeans(allGroups.Count), lookupTable2);
                        vectorList2.ForEach(v => v.Dispose());
                    }
                }

                // SVD-based latent semantic analysis, keeping the top K = 256 dimensions;
                // intermediate matrices are disposed eagerly to limit memory use
                Console.WriteLine("Building latent term/document space...");
                const int K        = 256;
                var       kIndices = Enumerable.Range(0, K).ToList();
                var       matrixT  = matrix.Transpose();
                matrix.Dispose();
                var svd = matrixT.Svd();
                matrixT.Dispose();

                var s  = lap.CreateDiagonal(svd.S.AsIndexable().Values.Take(K).ToList());
                var v2 = svd.VT.GetNewMatrixFromRows(kIndices);
                svd.Dispose();
                using (var sv2 = s.Multiply(v2)) {
                    v2.Dispose();
                    s.Dispose();

                    // one latent vector per document, in the original document order
                    var vectorList3  = sv2.AsIndexable().Columns.ToList();
                    var lookupTable3 = vectorList3.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);

                    Console.WriteLine("Kmeans clustering in latent document space...");
                    _WriteClusters(outputPath + "latent-kmeans.txt", vectorList3.KMeans(allGroups.Count), lookupTable3);
                }
            }
        }
Ejemplo n.º 28
0
 /// <summary>
 /// Builds the classic "Chinese Chinese Chinese Tokyo Japan" naive bayes test document
 /// as a list of string table indices.
 /// </summary>
 /// <param name="stringTableBuilder">String table used to assign an index to each word</param>
 /// <returns>The word indices, in document order</returns>
 public static IReadOnlyList <uint> GetTestRow(StringTableBuilder stringTableBuilder)
 {
     var words = new[] { "Chinese", "Chinese", "Chinese", "Tokyo", "Japan" };
     var row = new uint[words.Length];
     for (var i = 0; i < words.Length; i++)
     {
         row[i] = stringTableBuilder.GetIndex(words[i]);
     }
     return row;
 }