Example #1
        // Dump each document in the input index as one line of plain text
        // ("docIndex docIndex word1 word2 ..."), optionally copying documents that
        // pass the minimum word-count filter into a separate filtered index.
        public void Start()
        {
            var         reader = LuceneOperations.GetIndexReader(Configure.InputPath);
            var         sw     = new StreamWriter(Configure.OutputPath);
            IndexWriter writer = null;

            if (Configure.IsFilterByWordCount)
            {
                writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
            }
            if (Configure.IsLoadFromFeatureVector)
            {
                Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
            }

            Console.WriteLine("Total: " + reader.NumDocs());
            int docIndex = 0;

            for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
            {
                if (iDoc % 10000 == 0)
                {
                    Console.WriteLine(iDoc);
                    sw.Flush();
                }

                // Take the document text either from the precomputed feature-vector field
                // or from the weighted raw content fields.
                string content = Configure.IsLoadFromFeatureVector ? reader.Document(iDoc).Get(BingNewsFields.FeatureVector) :
                                 LuceneOperations.GetDocumentContent(reader.Document(iDoc), Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);

                List<string> words      = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                bool         isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
                if (isPrintDoc)
                {
                    if (Configure.IsFilterByWordCount)
                    {
                        writer.AddDocument(reader.Document(iDoc));
                    }

                    // The document index is written twice, followed by the document's tokens.
                    sw.Write(docIndex + " " + docIndex + " ");

                    foreach (var word in words)
                    {
                        sw.Write(word + " ");
                    }
                    sw.Write("\n");

                    docIndex++;
                }
            }

            if (Configure.IsFilterByWordCount)
            {
                writer.Optimize();
                writer.Close();
            }

            sw.Flush();
            sw.Close();
            reader.Close();
        }
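
Each line that Start() writes has the form "docIndex docIndex word1 word2 ... " (the index appears twice, followed by the document's tokens and a trailing space). Below is a minimal sketch of reading such a line back; ParseLine is a hypothetical helper and not part of the original code.

using System;
using System.Linq;

static class OutputLineParser
{
    // Hypothetical helper: split "docIndex docIndex word1 word2 ... " back into
    // the document index and its token list.
    public static Tuple<int, string[]> ParseLine(string line)
    {
        var parts    = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        var docIndex = int.Parse(parts[0]);       // parts[1] repeats the same index
        var words    = parts.Skip(2).ToArray();   // remaining entries are the tokens
        return Tuple.Create(docIndex, words);
    }
}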
Example #2
        // Build a sparse bag-of-words vector for one document, assigning a new
        // lexicon index to every word that has not been seen before.
        private SparseVectorList GetFeatureVector(Document doc, Dictionary<string, int> lexicon)
        {
            SparseVectorList featurevector = new SparseVectorList();

            int lexiconindexcount = lexicon.Count;

            var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
            var words   = NLPOperations.Tokenize(content, Configure.TokenizeConfig);

            foreach (var word in words)
            {
                // Map the word to its lexicon index, assigning a fresh index if it is new.
                int value;
                if (!lexicon.TryGetValue(word, out value))
                {
                    lexicon.Add(word, lexiconindexcount);
                    value = lexiconindexcount;
                    lexiconindexcount++;
                }
                // Increment the word's count, inserting the key on its first occurrence.
                if (!featurevector.Increase(value, 1))
                {
                    featurevector.Insert(value, 1);
                }
            }

            // Freeze the vector into parallel key/value arrays; discard empty vectors.
            featurevector.ListToArray();
            featurevector.count = featurevector.keyarray.Length;
            //featurevector.SumUpValueArray();
            if (featurevector.count < 1)
            {
                return null;
            }
            featurevector.InvalidateList();
            featurevector.GetNorm();
            return featurevector;
        }
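
SparseVectorList itself is not shown in these examples. The sketch below is an assumed, simplified stand-in covering only the members GetFeatureVector touches (Increase, Insert, ListToArray, keyarray, count, InvalidateList, GetNorm); the real class may differ.

using System;
using System.Collections.Generic;

// Assumed, simplified stand-in for the SparseVectorList members used above.
public class SparseVectorList
{
    private List<int> keylist   = new List<int>();
    private List<int> valuelist = new List<int>();

    public int[] keyarray;
    public int[] valuearray;
    public int   count;

    // Add delta to an existing key's value; return false if the key is absent.
    public bool Increase(int key, int delta)
    {
        int i = keylist.IndexOf(key);
        if (i < 0)
        {
            return false;
        }
        valuelist[i] += delta;
        return true;
    }

    // Record a new key with an initial value.
    public void Insert(int key, int value)
    {
        keylist.Add(key);
        valuelist.Add(value);
    }

    // Freeze the list representation into parallel arrays.
    public void ListToArray()
    {
        keyarray   = keylist.ToArray();
        valuearray = valuelist.ToArray();
    }

    // Drop the list representation once the arrays exist.
    public void InvalidateList()
    {
        keylist   = null;
        valuelist = null;
    }

    // Euclidean norm of the value array (assumed definition).
    public double GetNorm()
    {
        double sum = 0;
        foreach (var v in valuearray)
        {
            sum += (double)v * v;
        }
        return Math.Sqrt(sum);
    }
}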
Example #3
        // Sample documents from the input index, write the most frequent words to
        // TopWords.txt and, optionally, the most frequent co-occurring pairs of top
        // words to CoOccurrence.txt.
        public void Start()
        {
            if (!Configure.InputPath.EndsWith("\\"))
            {
                Configure.InputPath += "\\";
            }
            var reader     = LuceneOperations.GetIndexReader(Configure.InputPath);
            var docNum     = reader.NumDocs();
            var docNumPart = Math.Max(1, docNum / 100); // avoid modulo-by-zero when the index has fewer than 100 documents

            Console.WriteLine("Total: " + docNum);

            Random random = new Random(Configure.SampleSeed == -1 ? (int)DateTime.Now.Ticks : Configure.SampleSeed);

            // Top words
            var counter = new Counter<string>();

            // First sampled pass: count word frequencies over the sampled documents.
            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                if (iDoc % docNumPart == 0)
                {
                    Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                }
                if (random.NextDouble() > Configure.SampleRatio)
                {
                    continue;
                }

                var doc     = reader.Document(iDoc);
                var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                var words   = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                foreach (var word in words)
                {
                    counter.Add(word);
                }
            }
            var topwords        = counter.GetMostFreqObjs(Configure.TopWordCount);
            var wordCounterDict = counter.GetCountDictionary();

            var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt");

            foreach (var topword in topwords)
            {
                swTopWords.WriteLine(topword);
            }
            swTopWords.Flush();
            swTopWords.Close();

            //CoOccurrence
            if (Configure.IsPrintCooccurrence)
            {
                // One co-occurrence counter per top word.
                var occurCounterDict = new Dictionary<string, Counter<string>>();
                foreach (var topword in topwords)
                {
                    occurCounterDict.Add(topword, new Counter<string>());
                }
                // Second sampled pass: count co-occurrences of top words within each document
                // (the sample is drawn again, so this pass may visit a different subset of documents).
                for (int iDoc = 0; iDoc < docNum; iDoc++)
                {
                    if (iDoc % docNumPart == 0)
                    {
                        Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                    }
                    if (random.NextDouble() > Configure.SampleRatio)
                    {
                        continue;
                    }

                    var doc     = reader.Document(iDoc);
                    var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                    var words   = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig));
                    foreach (var word in words)
                    {
                        if (occurCounterDict.ContainsKey(word))
                        {
                            var occurCounter = occurCounterDict[word];
                            foreach (var word2 in words)
                            {
                                if (word2 == word)
                                {
                                    continue;
                                }
                                if (occurCounterDict.ContainsKey(word2))
                                {
                                    occurCounter.Add(word2);
                                }
                            }
                        }
                    }
                }
                // Rank every (top word, co-occurring word) pair by count and keep only the most frequent ones.
                var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount);
                var pairDict = new Dictionary<int, Tuple<string, string>>();
                var iPair    = 0;
                foreach (var kvp in occurCounterDict)
                {
                    var word         = kvp.Key;
                    var occurCounter = kvp.Value;
                    foreach (var kvp2 in occurCounter.GetCountDictionary())
                    {
                        heapSort.Insert(iPair, kvp2.Value);
                        pairDict.Add(iPair, new Tuple<string, string>(word, kvp2.Key));
                        iPair++;
                    }
                }

                var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt");
                foreach (var kvp in heapSort.GetSortedDictionary())
                {
                    var pair = pairDict[kvp.Key];
                    swCoOccurrence.WriteLine("{0} - {1}\t{2}",
                                             pair.Item1, pair.Item2, kvp.Value);
                }

                swCoOccurrence.Flush();
                swCoOccurrence.Close();
            }

            reader.Close();
        }
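
HeapSortDouble is also external to these examples. Based only on how it is called here (Insert(key, score) followed by GetSortedDictionary()), an assumed, simplified equivalent is sketched below; the real class presumably uses a bounded heap rather than sorting everything at the end.

using System.Collections.Generic;
using System.Linq;

// Assumed, simplified stand-in for the HeapSortDouble usage above: collect
// (key, score) pairs and return the n largest, sorted by score descending.
// Unlike a real bounded heap, this sketch keeps every pair in memory.
public class TopPairsByScore
{
    private readonly int _n;
    private readonly Dictionary<int, double> _scores = new Dictionary<int, double>();

    public TopPairsByScore(int n)
    {
        _n = n;
    }

    public void Insert(int key, double score)
    {
        _scores[key] = score;
    }

    // Return the n highest-scoring entries, largest first.
    public IEnumerable<KeyValuePair<int, double>> GetSortedDictionary()
    {
        return _scores.OrderByDescending(kvp => kvp.Value).Take(_n);
    }
}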