/// <summary>
/// Streams every document of the input Lucene index to a plain-text file
/// (one line per doc: "index index word word ..."), optionally filtering out
/// documents with fewer than Configure.MinWordCount tokens and copying the
/// surviving documents into a second index.
/// </summary>
public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var sw = new StreamWriter(Configure.OutputPath);

    // Only created when filtering is on; all uses below are guarded by the same flag.
    IndexWriter writer = null;
    if (Configure.IsFilterByWordCount)
    {
        writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
    }

    if (Configure.IsLoadFromFeatureVector)
    {
        // Feature-vector content is pre-tokenized; switch the tokenizer accordingly.
        Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
    }

    // Cache the count: the original evaluated reader.NumDocs() on every
    // iteration of the for-condition.
    int docNum = reader.NumDocs();
    Console.WriteLine("Total: " + docNum);

    int docIndex = 0;
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        if (iDoc % 10000 == 0)
        {
            Console.WriteLine(iDoc);
            sw.Flush();
        }

        string content = Configure.IsLoadFromFeatureVector
            ? reader.Document(iDoc).Get(BingNewsFields.FeatureVector)
            : LuceneOperations.GetDocumentContent(reader.Document(iDoc),
                Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);

        List<string> words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);

        // Keep the doc unless filtering is on and it is too short.
        bool isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
        if (isPrintDoc)
        {
            if (Configure.IsFilterByWordCount)
            {
                writer.AddDocument(reader.Document(iDoc));
            }

            // NOTE(review): docIndex is intentionally written twice — presumably the
            // downstream format expects "<id> <id> <tokens...>"; confirm with the consumer.
            sw.Write(docIndex + " " + docIndex + " ");
            foreach (var word in words)
            {
                sw.Write(word + " ");
            }
            sw.Write("\n");
            docIndex++;
        }
    }

    if (Configure.IsFilterByWordCount)
    {
        writer.Optimize();
        writer.Close();
    }

    sw.Flush();
    sw.Close();
    reader.Close();
}
/// <summary>
/// Builds a sparse bag-of-words vector for <paramref name="doc"/>, assigning a
/// fresh lexicon index to every previously unseen word (the lexicon is mutated).
/// </summary>
/// <param name="doc">Document whose weighted content fields are tokenized.</param>
/// <param name="lexicon">Word → index map; grows as new words are encountered. Must not be null.</param>
/// <returns>The populated vector, or null when the document produced no tokens.</returns>
private SparseVectorList GetFeatureVector(Document doc, Dictionary<string, int> lexicon)
{
    // BUG FIX: the original tested "lexicon == null || !TryGetValue(...)" and then
    // unconditionally called lexicon.Add(...), so a null lexicon always threw
    // NullReferenceException inside the branch. Fail fast with a clear exception instead.
    if (lexicon == null)
    {
        throw new ArgumentNullException(nameof(lexicon));
    }

    SparseVectorList featurevector = new SparseVectorList();
    int lexiconindexcount = lexicon.Count;

    var content = LuceneOperations.GetDocumentContent(doc,
        Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
    var words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);

    foreach (var word in words)
    {
        int value;
        if (!lexicon.TryGetValue(word, out value))
        {
            // First sighting: append the word to the lexicon with the next free index.
            value = lexiconindexcount;
            lexicon.Add(word, value);
            lexiconindexcount++;
        }

        // Increase returns false when the key is not yet present in the vector.
        if (!featurevector.Increase(value, 1))
        {
            featurevector.Insert(value, 1);
        }
    }

    featurevector.ListToArray();
    featurevector.count = featurevector.keyarray.Length;
    //featurevector.SumUpValueArray();

    // Empty documents yield no usable vector.
    if (featurevector.count < 1)
    {
        return null;
    }

    featurevector.InvalidateList();
    featurevector.GetNorm();
    return featurevector;
}
/// <summary>
/// Samples documents from a Lucene index (ratio Configure.SampleRatio), writes the
/// Configure.TopWordCount most frequent words to "TopWords.txt", and optionally a
/// second pass that writes the strongest top-word co-occurrence pairs to
/// "CoOccurrence.txt". Both files go into the input directory.
/// </summary>
public void Start()
{
    if (!Configure.InputPath.EndsWith("\\"))
    {
        Configure.InputPath += "\\";
    }

    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var docNum = reader.NumDocs();
    // BUG FIX: docNum / 100 is 0 for indexes with fewer than 100 docs, which made
    // "iDoc % docNumPart" throw DivideByZeroException. Clamp to at least 1.
    var docNumPart = Math.Max(1, docNum / 100);
    Console.WriteLine("Total: " + docNum);

    // A fixed seed makes the sampling reproducible; -1 means "seed from the clock".
    Random random = new Random(Configure.SampleSeed == -1 ? (int)DateTime.Now.Ticks : Configure.SampleSeed);

    // ---- Pass 1: count word frequencies over the sampled documents ----
    var counter = new Counter<string>();
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        if (iDoc % docNumPart == 0)
        {
            Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
        }
        if (random.NextDouble() > Configure.SampleRatio)
        {
            continue;
        }

        var doc = reader.Document(iDoc);
        var content = LuceneOperations.GetDocumentContent(doc,
            Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
        var words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
        foreach (var word in words)
        {
            counter.Add(word);
        }
    }

    var topwords = counter.GetMostFreqObjs(Configure.TopWordCount);
    var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt");
    foreach (var topword in topwords)
    {
        swTopWords.WriteLine(topword);
    }
    swTopWords.Flush();
    swTopWords.Close();

    // ---- Pass 2 (optional): co-occurrence counts between top words ----
    if (Configure.IsPrintCooccurrence)
    {
        var occurCounterDict = new Dictionary<string, Counter<string>>();
        foreach (var topword in topwords)
        {
            occurCounterDict.Add(topword, new Counter<string>());
        }

        // NOTE(review): the same Random instance continues here, so the second pass
        // samples a DIFFERENT subset of documents than the first — confirm that this
        // is intended rather than re-seeding for an identical sample.
        for (int iDoc = 0; iDoc < docNum; iDoc++)
        {
            if (iDoc % docNumPart == 0)
            {
                Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
            }
            if (random.NextDouble() > Configure.SampleRatio)
            {
                continue;
            }

            var doc = reader.Document(iDoc);
            var content = LuceneOperations.GetDocumentContent(doc,
                Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
            // Per-document set: co-occurrence is presence-based, not frequency-based.
            var words = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig));
            foreach (var word in words)
            {
                // TryGetValue avoids the original ContainsKey + indexer double lookup.
                Counter<string> occurCounter;
                if (!occurCounterDict.TryGetValue(word, out occurCounter))
                {
                    continue;
                }
                foreach (var word2 in words)
                {
                    if (word2 == word)
                    {
                        continue;
                    }
                    if (occurCounterDict.ContainsKey(word2))
                    {
                        occurCounter.Add(word2);
                    }
                }
            }
        }

        // Keep only the TopOccurrenceCount strongest pairs via a bounded heap.
        var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount);
        var pairDict = new Dictionary<int, Tuple<string, string>>();
        var iPair = 0;
        foreach (var kvp in occurCounterDict)
        {
            var word = kvp.Key;
            var occurCounter = kvp.Value;
            foreach (var kvp2 in occurCounter.GetCountDictionary())
            {
                heapSort.Insert(iPair, kvp2.Value);
                pairDict.Add(iPair, new Tuple<string, string>(word, kvp2.Key));
                iPair++;
            }
        }

        var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt");
        foreach (var kvp in heapSort.GetSortedDictionary())
        {
            var pair = pairDict[kvp.Key];
            swCoOccurrence.WriteLine("{0} - {1}\t{2}", pair.Item1, pair.Item2, kvp.Value);
        }
        swCoOccurrence.Flush();
        swCoOccurrence.Close();
    }

    reader.Close();
}