示例#1
0
        /// <summary>
        /// Writes one TF-IDF vector record per document to the file at <c>vectorPath</c>.
        /// </summary>
        /// <remarks>
        /// Documents are pulled one at a time through <c>ReadOneDoc()</c>; the loop stops
        /// as soon as <c>this.doc</c> is <c>null</c>. Each record consists of a 1-based
        /// document label followed by tab-separated <c>first:second</c> pairs taken from
        /// the result of <c>GetTfIdf</c>. A progress counter is printed to the console
        /// every 1000 documents.
        /// </remarks>
        private void OutputTfIdf()
        {
            var writer = new LargeFileWriter(vectorPath, FileMode.Create);
            try
            {
                // 1-based label of the current document; also serves as the progress counter.
                int docLabel = 0;

                ReadOneDoc();

                while (this.doc != null)
                {
                    // Borrow a tokenizer only for the duration of the tokenization call.
                    var tokenizer = TokenizerPool.GetTokenizer();
                    var document  = tokenizer.Tokenize(doc);
                    TokenizerPool.ReturnTokenizer(tokenizer);

                    docLabel++;
                    if (docLabel % 1000 == 0)
                    {
                        Console.WriteLine(docLabel);
                    }

                    var vector = GetTfIdf(document);
                    writer.Write(docLabel);
                    foreach (var value in vector)
                    {
                        writer.Write("\t" + value.first + ":" + value.second);
                    }

                    // NOTE(review): records are terminated with a bare carriage return,
                    // not "\n" or "\r\n" - confirm that the consumer of this file expects that.
                    writer.Write("\r");

                    ReadOneDoc();
                }
            }
            finally
            {
                // Close the output file even when tokenizing, vectorizing or writing throws,
                // so the file handle is not leaked on the error path.
                writer.Close();
            }
        }
示例#2
0
        /// <summary>
        /// Makes a single pass over the corpus and rebuilds the <c>df</c> and
        /// <c>wordTable</c> dictionaries from scratch.
        /// </summary>
        /// <remarks>
        /// For every document, each distinct token increments its entry in <c>df</c>
        /// by one, so <c>df</c> ends up holding the number of documents containing each
        /// word. A token seen for the first time is also registered in <c>wordTable</c>
        /// with the table's size at that moment as its value. <c>docNum</c> is
        /// incremented once per document (it is not reset here) and echoed to the
        /// console every 1000 documents. Iteration ends when <c>this.doc</c> is
        /// <c>null</c> after a call to <c>ReadOneDoc()</c>.
        /// </remarks>
        private void AnalyzeCorpus()
        {
            df        = new Dictionary<string, int>();
            wordTable = new Dictionary<string, int>();

            for (ReadOneDoc(); this.doc != null; ReadOneDoc())
            {
                this.docNum += 1;
                if (this.docNum % 1000 == 0)
                {
                    Console.WriteLine(this.docNum);
                }

                // Hold a pooled tokenizer just long enough to tokenize this document.
                var tokenizer = TokenizerPool.GetTokenizer();
                var tokens    = tokenizer.Tokenize(doc);
                TokenizerPool.ReturnTokenizer(tokenizer);

                // Deduplicate so that a word counts at most once per document.
                var distinctTerms = new HashSet<string>(tokens);
                foreach (var term in distinctTerms)
                {
                    int seenInDocs;
                    if (df.TryGetValue(term, out seenInDocs))
                    {
                        df[term] = seenInDocs + 1;
                    }
                    else
                    {
                        df[term] = 1;
                    }

                    if (wordTable.ContainsKey(term))
                    {
                        continue;
                    }

                    // The next free index is the current number of registered words.
                    wordTable.Add(term, wordTable.Count);
                }
            }
        }