コード例 #1
0
        /// <summary>
        /// Builds a sparse word-count feature vector for <paramref name="doc"/>.
        /// Every token that is not yet in <c>Lexicon</c> is added to it with the next
        /// free index, so calling this method can grow the lexicon.
        /// </summary>
        /// <param name="doc">
        /// Document whose content is read through
        /// <c>LuceneOperations.GetDocumentContent</c> and then tokenized.
        /// </param>
        /// <returns>
        /// The populated feature vector, or <c>null</c> when the document yields
        /// no features at all.
        /// </returns>
        /// <remarks>
        /// <c>Lexicon</c> must be initialized before this method is called; it is
        /// dereferenced unconditionally.
        /// </remarks>
        public SparseVectorList GetFeatureVector(Document doc)
        {
            SparseVectorList featurevector = new SparseVectorList();

            // Index handed to the next unseen word. Starting at Lexicon.Count
            // presumably keeps indices dense (0..Count-1) — verify against how
            // Lexicon is populated elsewhere.
            int lexiconindexcount = Lexicon.Count;

            var content = LuceneOperations.GetDocumentContent(doc, _fieldWeightDict, _leadingSentencesCnt);
            var words   = NLPOperations.Tokenize(content, _tokenizeConfig);

            foreach (var word in words)
            {
                int value;

                // The former "Lexicon == null ||" guard was removed: Lexicon has
                // already been dereferenced above, and the guarded branch called
                // Lexicon.Add, so a null lexicon could never have been handled here.
                if (!Lexicon.TryGetValue(word, out value))
                {
                    // First occurrence of this word anywhere: register it.
                    Lexicon.Add(word, lexiconindexcount);
                    value = lexiconindexcount;
                    lexiconindexcount++;
                }

                // Increase reports false when it could not bump the entry
                // (presumably because the key is absent), so insert it with count 1.
                if (!featurevector.Increase(value, 1))
                {
                    featurevector.Insert(value, 1);
                }
            }

            featurevector.ListToArray();
            featurevector.count = featurevector.keyarray.Length;
            //featurevector.SumUpValueArray();

            // An empty document produces no keys; callers receive null in that case.
            if (featurevector.count < 1)
            {
                return null;
            }

            featurevector.InvalidateList();
            featurevector.GetNorm();
            return featurevector;
        }
コード例 #2
0
        /// <summary>
        /// Opens the index at <paramref name="inputPath"/>, invokes
        /// <paramref name="action"/> once for each document id from 0 up to
        /// (but not including) the reader's <c>NumDocs()</c>, and reports
        /// progress along the way.
        /// </summary>
        /// <param name="inputPath">Path passed to <c>LuceneOperations.GetIndexReader</c>.</param>
        /// <param name="action">Callback invoked with each document.</param>
        /// <returns>The value of <c>NumDocs()</c> for the opened reader.</returns>
        public static int EnumerateIndexReader(string inputPath, Action <Document> action)
        {
            var indexReader = LuceneOperations.GetIndexReader(inputPath);

            try
            {
                var             docNum   = indexReader.NumDocs();
                ProgramProgress progress = new ProgramProgress(docNum);

                // NOTE(review): this walks ids 0..NumDocs()-1. If the index contains
                // deleted documents, Lucene document ids may extend beyond NumDocs()
                // and some ids in this range may be deleted — confirm the indexes
                // used here have no deletions.
                for (int iDoc = 0; iDoc < docNum; iDoc++)
                {
                    action(indexReader.Document(iDoc));
                    progress.PrintIncrementExperiment();
                }
                progress.PrintTotalTime();

                return docNum;
            }
            finally
            {
                // Release the reader even when the callback or a read throws;
                // previously an exception in the loop leaked the open reader.
                indexReader.Close();
            }
        }