예제 #1
0
        /// <summary>
        /// Runs the analyzer over every document, assigns an integer index to each distinct
        /// feature in order of first appearance, and assembles the per-document feature counts
        /// into a CSR matrix.
        /// </summary>
        /// <param name="raw_documents">Documents to analyze; one matrix row is produced per entry.</param>
        /// <returns>
        /// The vocabulary (feature to column index) and the count matrix. The same vocabulary
        /// instance is also stored in <c>vocabulary_</c>.
        /// </returns>
        public (Dictionary<string, int>, csr_matrix) _count_vocab(string[] raw_documents)
        {
            var vocabulary = new Dictionary<string, int>();
            var analyze    = build_analyzer();

            // Raw CSR pieces: stored counts, their column indices, and the row boundaries.
            var values    = new List<double>();
            var j_indices = new List<int>();
            var indptr    = new List<int> { 0 };

            foreach (string doc in raw_documents)
            {
                var feature_counter = new Dictionary<int, double>();

                foreach (string feature in analyze.analyze(doc))
                {
                    // One lookup instead of ContainsKey + indexer. An unseen feature gets the
                    // next free index, which always equals the current vocabulary size.
                    if (!vocabulary.TryGetValue(feature, out int feature_idx))
                    {
                        feature_idx = vocabulary.Count;
                        vocabulary.Add(feature, feature_idx);
                    }

                    // When the index has not been counted yet, TryGetValue leaves 'seen' at 0.
                    feature_counter.TryGetValue(feature_idx, out double seen);
                    feature_counter[feature_idx] = seen + 1;
                }

                // Close the row: append its columns and counts, then record where it ends.
                j_indices.AddRange(feature_counter.Keys);
                values.AddRange(feature_counter.Values);
                indptr.Add(j_indices.Count);
            }

            // Rebuild the dictionary with entries inserted in key order. The indices stored
            // as values are not renumbered here.
            vocabulary = vocabulary.OrderBy(x => x.Key).ToDictionary(x => x.Key, x => x.Value);

            var data1    = np.array(values.ToArray(), np.int32);
            var indices1 = np.array(j_indices.ToArray(), np.int32);
            var indptr1  = np.array(indptr.ToArray(), np.int32);

            // Rows = number of documents (indptr has one extra leading 0), columns = vocabulary size.
            var X = new csr_matrix(data1, indices1, indptr1,
                                   new Shape(indptr.Count - 1, vocabulary.Count),
                                   np.float64);

            X.sort_indices();

            vocabulary_ = vocabulary;

            return (vocabulary, X);
        }
예제 #2
0
        /// <summary>
        /// Counts, for every column of <paramref name="X"/>, how many stored entries reference it.
        /// </summary>
        /// <param name="X">CSR matrix whose <c>indices</c> array holds the column index of each stored entry.</param>
        /// <returns>
        /// An array with one count per column. Columns that have no stored entry are reported
        /// as 0 rather than being left out, so position <c>i</c> always describes column <c>i</c>.
        /// </returns>
        public static NDArray _document_frequency(csr_matrix X)
        {
            var (_, n_features) = X.shape.BiShape;
            var column_indices  = X.indices.Data<int>();

            // Size the result by the matrix width. It is widened if an index lies beyond the
            // declared shape, so such input is still counted instead of raising an index error.
            int length = n_features;
            foreach (int column in column_indices)
            {
                if (column >= length)
                {
                    length = column + 1;
                }
            }

            var dfs = new int[length];
            foreach (int column in column_indices)
            {
                dfs[column]++;
            }

            return np.array(dfs);
        }
예제 #3
0
        /// <summary>
        /// Placeholder for feature pruning. The matrix and vocabulary are currently returned
        /// unchanged for every combination of arguments.
        /// </summary>
        /// <param name="X">Document-term matrix.</param>
        /// <param name="vocabulary">Feature to column index mapping.</param>
        /// <param name="hight">Upper limit; -1 means not set.</param>
        /// <param name="low">Lower limit; -1 means not set.</param>
        /// <param name="limit">Feature count limit; -1 means not set.</param>
        /// <returns>The same <paramref name="X"/> and <paramref name="vocabulary"/> instances that were passed in.</returns>
        public (csr_matrix, Dictionary<string, int>) _limit_features(csr_matrix X, Dictionary<string, int> vocabulary, int hight = -1, int low = -1, int limit = -1)
        {
            bool anyLimitSet = hight != -1 || low != -1 || limit != -1;

            if (anyLimitSet)
            {
                // Only the inputs of the pruning step are prepared so far; nothing is filtered yet.
                // Term frequencies are not computed (original placeholder:
                // np.asarray(X.sum(axis: 0)).ravel()).
                var documentFrequencies = TextHelper._document_frequency(X);
                var keepMask            = np.ones((Shape)documentFrequencies.Storage.Shape, dtype: typeof(bool));
            }

            return (X, vocabulary);
        }
예제 #4
0
        /// <summary>
        /// Renumbers the features so that each one gets the index of its position in the
        /// enumeration order of <paramref name="vocabulary"/>, then rewrites the column indices
        /// of <paramref name="X"/> to match. Both arguments are modified in place.
        /// </summary>
        /// <param name="X">CSR matrix whose <c>indices</c> hold the old feature indices.</param>
        /// <param name="vocabulary">
        /// Feature to index mapping. Presumably already arranged in the desired order by the
        /// caller (<c>_count_vocab</c> builds it in key order) - verify against the caller.
        /// </param>
        /// <returns>The same <paramref name="X"/> instance with remapped column indices.</returns>
        public csr_matrix _sort_features(csr_matrix X, Dictionary<string, int> vocabulary)
        {
            // Take one snapshot of the entries instead of calling ElementAt(i) per iteration,
            // which walks the dictionary from the start every time (quadratic overall).
            var entries = vocabulary.ToArray();
            var mapping = new Dictionary<int, int>(entries.Length);

            for (int i = 0; i < entries.Length; i++)
            {
                mapping[entries[i].Value]  = i;   // old index -> new index
                vocabulary[entries[i].Key] = i;   // overwriting an existing key keeps the order
            }

            // Fetch the index buffer once and translate every stored column index.
            var column_indices = X.indices.Data<int>();
            int stored         = X.indices.size;

            for (int i = 0; i < stored; i++)
            {
                column_indices[i] = mapping[column_indices[i]];
            }

            return X;
        }
예제 #5
0
        /// <summary>
        /// Computes the inverse document frequency weights from <paramref name="X"/> and stores
        /// them as a diagonal CSR matrix in <c>_idf_diag</c>.
        /// </summary>
        /// <param name="X">Document-term matrix; its shape supplies the sample and feature counts.</param>
        /// <returns>This transformer instance.</returns>
        public TfidfTransformer fit(csr_matrix X)
        {
            var (sampleCount, featureCount) = X.shape.BiShape;
            var documentFrequency = TextHelper._document_frequency(X);

            // With smoothing enabled, both the document frequencies and the sample count grow by one.
            int smoothing = smooth_idf ? 1 : 0;
            documentFrequency += smoothing;
            sampleCount       += smoothing;

            // NOTE(review): sampleCount is an int and documentFrequency is built from an int
            // array; if this quotient is evaluated with integer division the weights lose
            // their fractional part - confirm NumSharp's division semantics.
            var ratio = sampleCount / documentFrequency;

            // Adding 1 to the logarithm keeps terms whose idf would be zero from being
            // suppressed entirely.
            var idf = np.log(ratio) + 1;

            _idf_diag = sp.diags(
                new NDArray[] { idf },
                offsets: new[] { 0 },
                shape: new Shape(featureCount, featureCount),
                format: "csr",
                dtype: np.float64);

            return this;
        }
예제 #6
0
        /// <summary>
        /// Placeholder for the tf-idf transformation. It currently only reads the shape of
        /// <paramref name="X"/> and applies no weighting.
        /// </summary>
        /// <param name="X">Document-term matrix.</param>
        /// <returns>This transformer instance.</returns>
        public TfidfTransformer transform(csr_matrix X)
        {
            // The dimensions are read but not used yet.
            var (sampleCount, featureCount) = X.shape.BiShape;

            return this;
        }