// Writes the whole corpus to `vectorPath`, one line per document in the form
//   <docLabel>\t<word>:<tfidf>\t... terminated by '\r'
// ('\r' is the original record separator; kept byte-identical so downstream
// readers of the vector file are unaffected). Documents are streamed via
// ReadOneDoc() and labeled sequentially starting at 1.
private void OutputTfIdf()
{
    var writer = new LargeFileWriter(vectorPath, FileMode.Create);
    try
    {
        int docLabel = 1;
        int num = 0;
        ReadOneDoc();
        while (this.doc != null)
        {
            var tokenizer = TokenizerPool.GetTokenizer();
            var document = tokenizer.Tokenize(doc);
            TokenizerPool.ReturnTokenizer(tokenizer);

            // Progress heartbeat every 1000 documents.
            if (++num % 1000 == 0)
            {
                Console.WriteLine(num);
            }

            var vector = GetTfIdf(document);
            writer.Write(docLabel);
            foreach (var value in vector)
            {
                writer.Write("\t" + value.first + ":" + value.second);
            }
            writer.Write("\r");

            ReadOneDoc();
            docLabel++;
        }
    }
    finally
    {
        // FIX: the original only closed the writer on the happy path, leaking
        // the file handle if Tokenize()/GetTfIdf() threw mid-corpus.
        // NOTE(review): tokenizer is still not returned to the pool on an
        // exception from Tokenize(); confirm whether the pool tolerates that.
        writer.Close();
    }
}
// Scans the corpus once (via ReadOneDoc()) to build:
//   df        — document frequency: for each word, the number of documents
//               that contain it at least once (not total occurrences);
//   wordTable — word -> dense sequential index, in first-seen order.
// Increments this.docNum for every document processed.
private void AnalyzeCorpus()
{
    df = new Dictionary<string, int>();
    wordTable = new Dictionary<string, int>();
    ReadOneDoc();
    while (this.doc != null)
    {
        this.docNum++;
        // Progress heartbeat every 1000 documents.
        if (this.docNum % 1000 == 0)
        {
            Console.WriteLine(this.docNum);
        }

        var tokenizer = TokenizerPool.GetTokenizer();
        var document = tokenizer.Tokenize(doc);
        TokenizerPool.ReturnTokenizer(tokenizer);

        // Deduplicate within the document so df counts documents, not tokens.
        // (Declared inside the loop — the original hoisted it for no reason,
        // keeping a stale reference alive after the loop.)
        var set = new HashSet<string>(document);
        foreach (var word in set)
        {
            // TryGetValue leaves `times` at 0 for unseen words, so this is
            // a clean "increment or initialize to 1".
            int times;
            df.TryGetValue(word, out times);
            df[word] = times + 1;

            // First sighting of a word gets the next sequential index.
            if (!wordTable.ContainsKey(word))
            {
                wordTable[word] = wordTable.Count;
            }
        }
        ReadOneDoc();
    }
}