/// <summary>
/// Builds a sparse feature vector for <paramref name="doc"/> by tokenizing its content and
/// counting occurrences of each token's lexicon index.
/// </summary>
/// <remarks>
/// Tokens that are not yet in <c>Lexicon</c> are added to it with the next consecutive index,
/// so this method mutates <c>Lexicon</c> as a side effect. <c>Lexicon</c> must be non-null:
/// its <c>Count</c> is read before any token is processed.
/// </remarks>
/// <param name="doc">
/// Document whose content is obtained through <c>LuceneOperations.GetDocumentContent</c>
/// using the configured field weights and leading-sentence count.
/// </param>
/// <returns>
/// The populated vector, or <c>null</c> when the resulting key array is empty.
/// </returns>
public SparseVectorList GetFeatureVector(Document doc)
{
    SparseVectorList featurevector = new SparseVectorList();

    // New words receive consecutive indices starting at the current lexicon size.
    int lexiconindexcount = Lexicon.Count;

    var content = LuceneOperations.GetDocumentContent(doc, _fieldWeightDict, _leadingSentencesCnt);
    var words = NLPOperations.Tokenize(content, _tokenizeConfig);

    foreach (var word in words)
    {
        int value;

        // Lexicon cannot be null here (Lexicon.Count above would already have thrown),
        // so only the lookup result decides whether the word is new.
        if (!Lexicon.TryGetValue(word, out value))
        {
            value = lexiconindexcount;
            Lexicon.Add(word, value);
            lexiconindexcount++;
        }

        // Increase reports false when it did not update an entry (presumably because the
        // key is absent - confirm in SparseVectorList); insert the index with count 1 then.
        if (!featurevector.Increase(value, 1))
        {
            featurevector.Insert(value, 1);
        }
    }

    // The call order below is kept exactly as in the original: materialize the arrays,
    // derive the count from them, then drop the list form and compute the norm.
    featurevector.ListToArray();
    featurevector.count = featurevector.keyarray.Length;

    if (featurevector.count < 1)
    {
        return null;
    }

    featurevector.InvalidateList();
    featurevector.GetNorm();

    return featurevector;
}
/// <summary>
/// Opens the index at <paramref name="inputPath"/> and invokes <paramref name="action"/>
/// once for each document id from 0 up to (but excluding) the reader's <c>NumDocs()</c>,
/// in ascending order, printing progress after each document.
/// </summary>
/// <param name="inputPath">Path passed to <c>LuceneOperations.GetIndexReader</c>.</param>
/// <param name="action">Callback invoked with each document.</param>
/// <returns>The document count reported by the reader's <c>NumDocs()</c>.</returns>
public static int EnumerateIndexReader(string inputPath, Action<Document> action)
{
    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    try
    {
        var docNum = indexReader.NumDocs();
        ProgramProgress progress = new ProgramProgress(docNum);

        // NOTE(review): ids 0..NumDocs()-1 only cover every live document when the index
        // has no deletions - confirm the indexes used here are never modified after build.
        for (int iDoc = 0; iDoc < docNum; iDoc++)
        {
            action(indexReader.Document(iDoc));
            progress.PrintIncrementExperiment();
        }

        progress.PrintTotalTime();
        return docNum;
    }
    finally
    {
        // Release the reader even when the callback or a document read throws;
        // previously the reader was leaked on any exception.
        indexReader.Close();
    }
}