internal SparseVectorClassificationSetSplit(SequenceSplit<SparseVectorClassification> split)
 {
     // copy the training partition into its own classification set
     Training = new SparseVectorClassificationSet {
         Classification = split.Training.ToArray()
     };
     // and likewise for the test partition
     Test = new SparseVectorClassificationSet {
         Classification = split.Test.ToArray()
     };
 }
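This constructor simply copies an existing training/test partition into two strongly typed classification sets. For context, a minimal sketch of how such a partition could be produced with plain LINQ (the 80/20 ratio and the Guid-based shuffle are illustrative assumptions, not the library's own SequenceSplit logic):

using System;
using System.Collections.Generic;
using System.Linq;

static class SplitSketch
{
    // naively shuffle, then cut at the requested training fraction
    public static (T[] Training, T[] Test) Split<T>(IReadOnlyList<T> items, double trainingFraction = 0.8)
    {
        var shuffled = items.OrderBy(_ => Guid.NewGuid()).ToArray();
        var trainingCount = (int)(shuffled.Length * trainingFraction);
        return (shuffled.Take(trainingCount).ToArray(), shuffled.Skip(trainingCount).ToArray());
    }
}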
Example 2
        public WeightedClassificationSetDataProvider(ILinearAlgebraProvider lap, SparseVectorClassificationSet data, uint maxIndex)
        {
            _lap = lap;

            // assign each distinct class label an integer index
            var classifications = data.GetClassifications();

            // build the reverse lookup (index => label)
            _classification = classifications.ToDictionary(d => d.Value, d => d.Key);

            // convert each sparse vector into a dictionary of feature index => weight, paired with its class index
            _data           = data.Classification.Select(c => Tuple.Create(c.Data.ToDictionary(d => (int)d.Index, d => d.Weight), classifications[c.Name])).ToList();
            _inputSize      = (int)maxIndex;
            _outputSize     = classifications.Count;
        }
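The two dictionaries built here are just a bidirectional label index plus a sparse-vector conversion. A self-contained sketch of the same idea using only plain .NET types (the labels and class name are illustrative):

using System;
using System.Linq;

class LabelIndexSketch
{
    static void Main()
    {
        var labels = new[] { "sport", "politics", "sport", "science" };

        // label => index, mirroring data.GetClassifications()
        var labelToIndex = labels.Distinct()
                                 .Select((name, index) => (name, index))
                                 .ToDictionary(x => x.name, x => x.index);

        // index => label, mirroring the _classification field above
        var indexToLabel = labelToIndex.ToDictionary(kv => kv.Value, kv => kv.Key);

        Console.WriteLine(labelToIndex["politics"]); // 1
        Console.WriteLine(indexToLabel[2]);          // science
    }
}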
Example 3
        /// <summary>
        /// Clusters a tagged set of documents (the AAAI-14 accepted papers dataset,
        /// which can be downloaded from https://archive.ics.uci.edu/ml/machine-learning-databases/00307/)
        /// </summary>
        /// <param name="dataFilePath">The path to the data file</param>
        /// <param name="outputPath">A directory to write the output files to</param>
        public static void TextClustering(string dataFilePath, string outputPath)
        {
            IDataTable dataTable;

            using (var reader = new StreamReader(dataFilePath)) {
                dataTable = reader.ParseCSV();
            }

            var KEYWORD_SPLIT = " \n".ToCharArray();
            var TOPIC_SPLIT   = "\n".ToCharArray();

            var docList = new List<AAAIDocument>();

            // create strongly typed documents from the data table
            dataTable.ForEach(row => docList.Add(new AAAIDocument {
                Abstract = row.GetField<string>(5),
                Keyword  = row.GetField<string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
                Topic    = row.GetField<string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                Group    = row.GetField<string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                Title    = row.GetField<string>(0)
            }));
            var docTable  = docList.ToDictionary(d => d.Title, d => d);
            var allGroups = new HashSet<string>(docList.SelectMany(d => d.Group));

            var stringTable       = new StringTableBuilder();
            var classificationSet = new SparseVectorClassificationSet {
                Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
            };
            var encodings = classificationSet.Vectorise(true);

            using (var lap = Provider.CreateLinearAlgebra()) {
                var lookupTable = encodings.Select(d => Tuple.Create(d, lap.Create(d.Data))).ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification]);
                var vectorList  = lookupTable.Select(d => d.Key).ToList();

                Console.WriteLine("Kmeans clustering...");
                _WriteClusters(outputPath + "kmeans.txt", vectorList.KMeans(allGroups.Count), lookupTable);

                Console.WriteLine("NNMF clustering...");
                _WriteClusters(outputPath + "nnmf.txt", vectorList.NNMF(lap, allGroups.Count, 100), lookupTable);

                // create a term/document matrix with terms as columns and documents as rows
                var matrix = lap.CreateMatrix(vectorList.Select(v => v.Data).ToList());
                vectorList.ForEach(v => v.Dispose());

                Console.WriteLine("Creating random projection...");
                using (var randomProjection = lap.CreateRandomProjection((int)classificationSet.GetMaximumIndex() + 1, 512)) {
                    using (var projectedMatrix = randomProjection.Compute(matrix)) {
                        var vectorList2  = Enumerable.Range(0, projectedMatrix.RowCount).Select(i => projectedMatrix.Row(i)).ToList();
                        var lookupTable2 = vectorList2.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);

                        Console.WriteLine("Kmeans clustering of random projection...");
                        _WriteClusters(outputPath + "projected-kmeans.txt", vectorList2.KMeans(allGroups.Count), lookupTable2);
                        vectorList2.ForEach(v => v.Dispose());
                    }
                }

                Console.WriteLine("Building latent term/document space...");
                const int K        = 256;
                var       kIndices = Enumerable.Range(0, K).ToList();
                var       matrixT  = matrix.Transpose();
                matrix.Dispose();
                var svd = matrixT.Svd();
                matrixT.Dispose();

                var s  = lap.CreateDiagonal(svd.S.AsIndexable().Values.Take(K).ToList());
                var v2 = svd.VT.GetNewMatrixFromRows(kIndices);
                svd.Dispose();
                using (var sv2 = s.Multiply(v2)) {
                    v2.Dispose();
                    s.Dispose();

                    var vectorList3  = sv2.AsIndexable().Columns.ToList();
                    var lookupTable3 = vectorList3.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);

                    Console.WriteLine("Kmeans clustering in latent document space...");
                    _WriteClusters(outputPath + "latent-kmeans.txt", vectorList3.KMeans(allGroups.Count), lookupTable3);
                }
            }
        }
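The method above clusters the documents four ways: k-means and NNMF on the raw term/document vectors, k-means after a random projection, and k-means in a truncated-SVD latent space. A minimal sketch of the random projection idea in plain C# follows: each d-dimensional document vector is multiplied by a random d x k matrix to give a much smaller k-dimensional embedding that approximately preserves distances. The Gaussian entries and 1/sqrt(k) scaling are standard illustrative choices; the library's CreateRandomProjection may construct its matrix differently.

using System;

static class RandomProjectionSketch
{
    // multiply one d-dimensional vector by a d x k matrix => k-dimensional vector
    public static float[] Project(float[] vector, float[][] randomMatrix)
    {
        var k = randomMatrix[0].Length;
        var result = new float[k];
        for (var i = 0; i < vector.Length; i++)
            for (var j = 0; j < k; j++)
                result[j] += vector[i] * randomMatrix[i][j];
        return result;
    }

    // fill the projection matrix with scaled standard-normal samples
    public static float[][] CreateGaussianMatrix(int d, int k, int seed = 0)
    {
        var rng = new Random(seed);
        var matrix = new float[d][];
        for (var i = 0; i < d; i++) {
            matrix[i] = new float[k];
            for (var j = 0; j < k; j++) {
                // Box-Muller transform: two uniform samples => one standard normal sample
                var u1 = 1.0 - rng.NextDouble();
                var u2 = rng.NextDouble();
                var normal = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Cos(2.0 * Math.PI * u2);
                matrix[i][j] = (float)(normal / Math.Sqrt(k));
            }
        }
        return matrix;
    }
}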
Example 4
        void _AnalyseDataset()
        {
            Dispatcher.Invoke(() => {
                _statusMessage.Add("Downloading dataset...");
            });
            var uri           = new Uri("https://archive.ics.uci.edu/ml/machine-learning-databases/00307/%5bUCI%5d%20AAAI-14%20Accepted%20Papers%20-%20Papers.csv");
            var KEYWORD_SPLIT = " \n".ToCharArray();
            var TOPIC_SPLIT   = "\n".ToCharArray();

            // download the document list
            var docList = new List<AAAIDocument>();

            using (var client = new WebClient()) {
                var data = client.DownloadData(uri);

                Dispatcher.Invoke(() => {
                    _statusMessage.Add("Building data table...");
                });

                // parse the CSV file
                var dataTable = new StreamReader(new MemoryStream(data)).ParseCSV(',');

                // create strongly typed documents from the data table
                dataTable.ForEach(row => docList.Add(new AAAIDocument {
                    Abstract = row.GetField<string>(5),
                    Keyword  = row.GetField<string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
                    Topic    = row.GetField<string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Group    = row.GetField<string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Title    = row.GetField<string>(0)
                }));
            }

            // create a document lookup table
            var docTable = docList.ToDictionary(d => d.Title, d => d);

            // extract features from each document's metadata
            var stringTable       = new StringTableBuilder();
            var classificationSet = new SparseVectorClassificationSet {
                Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
            };

            // create dense feature vectors and normalise along the way
            var encodings = classificationSet.Vectorise(true);

            using (var lap = Provider.CreateLinearAlgebra()) {
                var lookupTable = encodings.Select(d => Tuple.Create(d, lap.Create(d.Data))).ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification]);
                var vectorList  = lookupTable.Select(d => d.Key).ToList();

                // create a term/document matrix with terms as columns and documents as rows
                var matrix = lap.CreateMatrix(vectorList.Select(d => d.Data).ToList());

                Dispatcher.Invoke(() => {
                    _statusMessage.Add("Performing latent semantic analysis...");
                });

                // compute the SVD
                const int K        = 3;
                var       kIndices = Enumerable.Range(0, K).ToList();
                var       matrixT  = matrix.Transpose();
                var       svd      = matrixT.Svd();

                // create latent space
                var s  = lap.CreateDiagonal(svd.S.AsIndexable().Values.Take(K).ToList());
                var v2 = svd.VT.GetNewMatrixFromRows(kIndices);
                using (var sv2 = s.Multiply(v2)) {
                    var vectorList2  = sv2.AsIndexable().Columns.ToList();
                    var lookupTable2 = vectorList2.Select((v, i) => Tuple.Create(v, vectorList[i])).ToDictionary(d => (IVector)d.Item1, d => lookupTable[d.Item2]);

                    // cluster the latent space
                    var clusters     = vectorList2.KMeans(COLOUR_LIST.Length);
                    var clusterTable = clusters
                                       .Select((l, i) => Tuple.Create(l, i))
                                       .SelectMany(d => d.Item1.Select(v => Tuple.Create(v, d.Item2)))
                                       .ToDictionary(d => d.Item1, d => COLOUR_LIST[d.Item2])
                    ;

                    // build the document list
                    var    documentList = new List<Document>();
                    int    index = 0;
                    double maxX = double.MinValue, minX = double.MaxValue, maxY = double.MinValue, minY = double.MaxValue, maxZ = double.MinValue, minZ = double.MaxValue;
                    foreach (var item in vectorList2)
                    {
                        float x = item[0];
                        float y = item[1];
                        float z = item[2];
                        documentList.Add(new Document(x, y, z, index++, lookupTable2[item], clusterTable[item]));
                        // track the bounding box of the projected points
                        maxX = Math.Max(maxX, x); minX = Math.Min(minX, x);
                        maxY = Math.Max(maxY, y); minY = Math.Min(minY, y);
                        maxZ = Math.Max(maxZ, z); minZ = Math.Min(minZ, z);
                    }
                    double rangeX = maxX - minX;
                    double rangeY = maxY - minY;
                    double rangeZ = maxZ - minZ;
                    foreach (var document in documentList)
                    {
                        document.Normalise(minX, rangeX, minY, rangeY, minZ, rangeZ);
                    }

                    Dispatcher.Invoke(() => {
                        var numDocs   = documentList.Count;
                        _cube         = new Cube[numDocs];
                        _searchResult = new SearchResult[numDocs];

                        _statusMessage.Add("Creating 3D graph...");

                        var SCALE = 10;
                        for (var i = 0; i < numDocs; i++)
                        {
                            var document        = documentList[i];
                            var cube            = _cube[i] = new Cube(SCALE * document.X, SCALE * document.Y, SCALE * document.Z, i);
                            var searchResult    = _searchResult[i] = new SearchResult(document.AAAIDocument, i);
                            cube.Colour         = document.Colour;
                            searchResult.Colour = document.Colour;

                            searchResult.MouseHoverEvent += new SearchResult.MouseHoverDelegate(searchResult_MouseHoverEvent);
                            viewPort.Children.Add(cube);
                        }

                        foreach (var item in _searchResult.OrderBy(sr => sr.Colour.GetHashCode()))
                        {
                            panelResults.Children.Add(item);
                        }

                        icStatus.Visibility = Visibility.Collapsed;
                        viewPort.Visibility = Visibility.Visible;
                        progress.Visibility = Visibility.Collapsed;
                    });
                }
            }
        }
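The Normalise call above rescales each document's coordinates into a common range before the points are plotted. A one-method sketch of the usual min/max rescaling, assuming Document.Normalise maps each axis into [0, 1]:

static class NormaliseSketch
{
    // (value - min) / range maps the observed [min, max] interval onto [0, 1];
    // the guard avoids dividing by zero when every point shares the same coordinate
    public static double Normalise(double value, double min, double range) =>
        range > 0 ? (value - min) / range : 0;
}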
Example 5
        void _ClusterDataset()
        {
            Dispatcher.Invoke((SimpleDelegate) delegate() {
                _statusMessage.Add("Downloading dataset...");
            });
            var uri           = new Uri("https://archive.ics.uci.edu/ml/machine-learning-databases/00307/%5bUCI%5d%20AAAI-14%20Accepted%20Papers%20-%20Papers.csv");
            var KEYWORD_SPLIT = " \n".ToCharArray();
            var TOPIC_SPLIT   = "\n".ToCharArray();

            // download the document list
            var docList = new List<AAAIDocument>();

            using (var client = new WebClient()) {
                var data = client.DownloadData(uri);

                Dispatcher.Invoke((SimpleDelegate) delegate() {
                    _statusMessage.Add("Building data table...");
                });

                // parse the CSV file
                var dataTable = new StreamReader(new MemoryStream(data)).ParseCSV(',');

                // create strongly typed documents from the data table
                dataTable.ForEach(row => docList.Add(new AAAIDocument {
                    Abstract = row.GetField<string>(5),
                    Keyword  = row.GetField<string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(),
                    Topic    = row.GetField<string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Group    = row.GetField<string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries),
                    Title    = row.GetField<string>(0)
                }));
            }

            // create a document lookup table
            var docTable = docList.ToDictionary(d => d.Title, d => d);

            // extract features from each document's metadata
            var stringTable       = new StringTableBuilder();
            var classificationSet = new SparseVectorClassificationSet {
                Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray()
            };

            // create feature vectors from the documents, normalising along the way
            var encodings = classificationSet.Vectorise(true);

            // convert the sparse feature vectors into dense vectors
            var documentClusterList = new List<DocumentCluster>();

            using (var lap = Provider.CreateLinearAlgebra()) {
                var lookupTable = encodings
                                  .Select(d => Tuple.Create(d, lap.Create(d.Data).AsIndexable()))
                                  .ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification])
                ;
                var vectorList = lookupTable.Select(d => d.Key).ToList();

                Dispatcher.Invoke((SimpleDelegate) delegate() {
                    _statusMessage.Add("Clustering data...");
                });

                // cluster the dense vectors
                using (var nnmf = new NNMF(lap, vectorList, _clusterColour.Length)) {
                    var clusters = nnmf.Cluster(40, cost => {
                        Dispatcher.Invoke((SimpleDelegate) delegate() {
                            _statusMessage.Add("NNMF error: " + cost.ToString());
                        });
                    });

                    // create document clusters from the NNMF results
                    int index = 0;
                    foreach (var cluster in clusters)
                    {
                        var documentCluster = new List<AAAIDocument>();
                        foreach (var item in cluster)
                        {
                            var document = lookupTable[item];
                            documentCluster.Add(document);
                        }
                        var desc = String.Join(", ", nnmf.GetRankedFeatures(index++)
                                               .Select(i => stringTable.GetString(i))
                                               .Take(32)
                                               );
                        documentClusterList.Add(new DocumentCluster(documentCluster, desc));
                    }

                    // collect the cluster membership for each document
                    for (int i = 0, len = vectorList.Count; i < len; i++)
                    {
                        lookupTable[vectorList[i]].ClusterMembership = nnmf.GetClusterMembership(i);
                    }
                }
            }

            Dispatcher.Invoke((SimpleDelegate) delegate() {
                _UpdateUI(documentClusterList);
            });
        }
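NNMF produces soft cluster memberships: each document receives a weight for every cluster rather than a single label. A small sketch of how the membership weights collected above could be reduced to a hard assignment (assuming GetClusterMembership returns one weight per cluster as a float array, which is an assumption about its return type):

static class MembershipSketch
{
    // return the index of the cluster with the largest membership weight
    public static int StrongestCluster(float[] membershipWeights)
    {
        var best = 0;
        for (var i = 1; i < membershipWeights.Length; i++) {
            if (membershipWeights[i] > membershipWeights[best])
                best = i;
        }
        return best;
    }
}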