private SparseVectorList GetFeatureVector(Document doc, Dictionary<string, int> lexicon)
{
    // Builds a sparse term-frequency vector for one document, growing the shared lexicon
    // with any previously unseen word. A null lexicon is not supported (the original null
    // branch would have thrown on the subsequent Add call), so it is treated as "no vector".
    if (lexicon == null)
    {
        return null;
    }

    SparseVectorList featurevector = new SparseVectorList();
    int lexiconindexcount = lexicon.Count;

    var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
    var words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);

    foreach (var word in words)
    {
        int value;
        if (!lexicon.TryGetValue(word, out value))
        {
            // Unseen word: assign it the next free lexicon index.
            lexicon.Add(word, lexiconindexcount);
            value = lexiconindexcount;
            lexiconindexcount++;
        }
        if (!featurevector.Increase(value, 1))
        {
            featurevector.Insert(value, 1);
        }
    }

    featurevector.ListToArray();
    featurevector.count = featurevector.keyarray.Length;
    //featurevector.SumUpValueArray();
    if (featurevector.count < 1)
    {
        return null;
    }
    featurevector.InvalidateList();
    featurevector.GetNorm();
    return featurevector;
}
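// A hypothetical usage sketch for GetFeatureVector above: build one sparse vector per indexed
// document while growing a shared lexicon. The surrounding loop and variable names are
// assumptions; only GetFeatureVector, LuceneOperations, SparseVectorList, and Configure come
// from the code in this listing.
var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
var lexicon = new Dictionary<string, int>();
var featureVectors = new List<SparseVectorList>();
for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
{
    var vector = GetFeatureVector(reader.Document(iDoc), lexicon);
    if (vector != null)   // documents with no usable tokens are skipped
    {
        featureVectors.Add(vector);
    }
}
reader.Close();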
public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var sw = new StreamWriter(Configure.OutputPath);

    IndexWriter writer = null;
    if (Configure.IsFilterByWordCount)
    {
        writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
    }

    if (Configure.IsLoadFromFeatureVector)
    {
        Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
    }

    Console.WriteLine("Total: " + reader.NumDocs());

    int docIndex = 0;
    for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
    {
        if (iDoc % 10000 == 0)
        {
            Console.WriteLine(iDoc);
            sw.Flush();
        }

        var document = reader.Document(iDoc);
        string content = Configure.IsLoadFromFeatureVector
            ? document.Get(BingNewsFields.FeatureVector)
            : LuceneOperations.GetDocumentContent(document, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
        List<string> words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);

        // Keep the document unless word-count filtering is on and it is too short.
        bool isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
        if (isPrintDoc)
        {
            if (Configure.IsFilterByWordCount)
            {
                writer.AddDocument(document);
            }

            sw.Write(docIndex + " " + docIndex + " ");
            foreach (var word in words)
            {
                sw.Write(word + " ");
            }
            sw.Write("\n");
            docIndex++;
        }
    }

    if (Configure.IsFilterByWordCount)
    {
        writer.Optimize();
        writer.Close();
    }

    sw.Flush();
    sw.Close();
    reader.Close();
}
private List<int> GetHashtagNumberInappropriateDocuments(IndexReader indexreader, string outputfile)
{
    Console.WriteLine("==========Remove inappropriate hashtag number documents!==========");
    StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
    List<int> removedDocuments = new List<int>();

    int docNum = indexreader.NumDocs();
    string bodyfield = this.BodyField;
    int removedDocNum = 0;
    Console.WriteLine("Total documents: {0}", docNum);

    var tokenConfig = new TokenizeConfig(TokenizerType.Hashtag, StopWordsFile.NO);
    for (int idoc = 0; idoc < docNum; idoc++)
    {
        // Progress report every 10,000 documents; skipped at idoc == 0 so the percentage
        // below never divides by zero and document 0 is still processed.
        if (idoc % 10000 == 0 && idoc != 0)
        {
            Console.WriteLine("Process " + idoc + "th document!");
            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, idoc, 100 * removedDocNum / idoc);
            if (IsPrintTextFiles)
            {
                sw.Flush();
            }
        }

        Document document = indexreader.Document(idoc);
        string content = document.Get(bodyfield);
        var words = NLPOperations.Tokenize(content, tokenConfig);

        // Remove the document if its hashtag count falls outside [MinHashtagNumber, MaxHashtagNumber].
        if (words.Count < MinHashtagNumber || words.Count > MaxHashtagNumber)
        {
            if (IsPrintTextFiles)
            {
                sw.WriteLine(DocumentToString(document));
            }
            removedDocuments.Add(idoc);
            removedDocNum++;
        }
    }

    Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
    if (IsPrintTextFiles)
    {
        sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
        sw.Flush();
        sw.Close();
    }
    return removedDocuments;
}
public void Parse(TokenizeConfig tokenizeConfig)
{
    // The title is repeated three times so that title terms carry more weight than body terms.
    var text = Title + " " + Title + " " + Title + " " + Body;
    var words = NLPOperations.Tokenize(text, tokenizeConfig);

    int wordIndex = 0;
    Counter<int> counter = new Counter<int>();
    foreach (var word in words)
    {
        if (!KmeansLexicon.Word2IndexDict.TryGetValue(word, out wordIndex))
        {
            // Unseen word: register it in the shared lexicon in both directions.
            wordIndex = KmeansLexicon.Word2IndexDict.Count;
            KmeansLexicon.Word2IndexDict.Add(word, wordIndex);
            KmeansLexicon.Index2WordDict.Add(wordIndex, word);
        }
        counter.Add(wordIndex);
    }

    Vector = SortUtils.EnsureSortedByKey(counter.GetCountDictionary().ToDictionary(kvp2 => kvp2.Key, kvp2 => (double)kvp2.Value));

    // Normalize to unit length so that later cosine similarities reduce to dot products.
    Norm = Maths.GetVectorLength(Vector);
    Vector = Maths.GetVectorMultiply(Vector, 1.0 / Norm);
    Norm = 1;
    _orgVector = Vector;
}
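// A minimal sketch (not part of the original code) of why Parse fixes Norm at 1: once both
// vectors are unit length, cosine similarity is just the dot product of their sparse entries.
// It assumes the parsed vectors are exposed as Dictionary<int, double>; the helper name and
// signature are illustrative.
private static double CosineSimilarity(Dictionary<int, double> a, Dictionary<int, double> b)
{
    // Iterate over the smaller vector and probe the larger one.
    var small = a.Count <= b.Count ? a : b;
    var large = ReferenceEquals(small, a) ? b : a;
    double dot = 0;
    foreach (var kvp in small)
    {
        double v;
        if (large.TryGetValue(kvp.Key, out v))
        {
            dot += kvp.Value * v;
        }
    }
    return dot; // equals the cosine because both vectors are normalized to unit length
}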
public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
{
    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var docNum = indexReader.NumDocs();
    int[] docWordCnt = new int[docNum];
    int[] docUniqWordCnt = new int[docNum];
    Dictionary<string, int> wordDocCntDict = new Dictionary<string, int>();
    Dictionary<string, int> wordOccCntDict = new Dictionary<string, int>();

    var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
        ? BingNewsFields.FeatureVectorFieldWeights
        : BingNewsFields.NewsFieldWeights;

    ProgramProgress progress = new ProgramProgress(docNum);
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        var document = indexReader.Document(iDoc);
        var content = LuceneOperations.GetContent(document, fieldWeights);
        var words = NLPOperations.Tokenize(content, tokenConfig);
        var uniqueWords = new HashSet<string>(words);

        docWordCnt[iDoc] = words.Count;
        docUniqWordCnt[iDoc] = uniqueWords.Count;

        // Document frequency: count each word once per document.
        foreach (var word in uniqueWords)
        {
            if (!wordDocCntDict.ContainsKey(word))
            {
                wordDocCntDict.Add(word, 0);
            }
            wordDocCntDict[word]++;
        }

        // Occurrence frequency: count every token.
        foreach (var word in words)
        {
            if (!wordOccCntDict.ContainsKey(word))
            {
                wordOccCntDict.Add(word, 0);
            }
            wordOccCntDict[word]++;
        }

        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    indexReader.Close();

    //Statistics
    DoubleStatistics statDocWordCnt = new DoubleStatistics();
    DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
    DoubleStatistics statWordDocCnt = new DoubleStatistics();
    DoubleStatistics statWordOccCnt = new DoubleStatistics();

    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        statDocWordCnt.AddNumber(docWordCnt[iDoc]);
        statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
    }
    foreach (var kvp in wordDocCntDict)
    {
        statWordDocCnt.AddNumber(kvp.Value);
    }
    foreach (var kvp in wordOccCntDict)
    {
        statWordOccCnt.AddNumber(kvp.Value);
    }

    Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
    Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
    Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
    Console.WriteLine(statWordOccCnt.ToString("statWordOccCnt"));

    //Hist
    var docWordCntHist = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1);
    var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1);
    var wordDocCntHist = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
    var wordDocCntHist2 = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1);

    docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
    docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
    wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
    wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");

    Console.Read();
}
public void Transform(string inputFolder, string indexPath, HashSet<string> keywords)
{
    Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
    Console.WriteLine("InputFolder: " + inputFolder + "\n");

    string notParseSpecString = "Temp-DoNotParse";
    inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

    // Column layout of the tab-separated input files.
    string[] schema = new[]
    {
        "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount",
        "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount"
    };
    var schemaDict = Util.GetInvertedDictionary(schema);
    var textFieldIndex = schemaDict["Text"];
    var createdTimeFieldIndex = schemaDict["CreatedAt"];
    var userIdFieldIndex = schemaDict["UserId"];

    //string outputPath = inputFolder + notParseSpecString + "\\";
    //if (Directory.Exists(outputPath))
    //{
    //    Directory.Delete(outputPath, true);
    //}
    //Directory.CreateDirectory(outputPath);
    //var indexPath = outputPath + "Index\\";
    if (Directory.Exists(indexPath))
    {
        Directory.Delete(indexPath, true);
    }

    var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

    //Preprocess: estimate the total tweet count so progress can be reported.
    Console.WriteLine("Start preprocessing...");
    ProgramProgress progress = new ProgramProgress(files.Length);
    int estiDocCnt = 0;
    foreach (var file in files)
    {
        estiDocCnt += FileOperations.GetLineCount(file);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

    //Parse
    Console.WriteLine("Start parsing...");
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);

    progress = new ProgramProgress(estiDocCnt);
    var sep = new char[] { '\t' };
    int uniqDocFoundCnt = 0;
    int docFoundCnt = 0;
    int docCnt = 0;
    // Deduplicate tweets by the (createdAt, userId, text) triple.
    ThreeLayerHashSet<string, long, string> hash3Layer = new ThreeLayerHashSet<string, long, string>();
    int notUsedDocCnt = 0;

    foreach (var file in files)
    {
        if (file.Contains(notParseSpecString))
        {
            continue;
        }
        if (file.EndsWith(".txt"))
        {
            var sr = new StreamReader(file);
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                var tokens = line.Split(sep, StringSplitOptions.None);
                if (tokens.Length != schema.Length)
                {
                    notUsedDocCnt++;
                    continue;
                    //throw new ArgumentException();
                }

                // Keep the tweet only if its tokenized text contains at least one search keyword.
                var words = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                bool isContainSearch = false;
                foreach (var word in words)
                {
                    if (keywords.Contains(word))
                    {
                        isContainSearch = true;
                        break;
                    }
                }

                if (isContainSearch)
                {
                    string createdAt = tokens[createdTimeFieldIndex];
                    long userId = long.Parse(tokens[userIdFieldIndex]);
                    string text = tokens[textFieldIndex];

                    if (!hash3Layer.Contains(createdAt, userId, text))
                    {
                        var document = new Document();
                        for (int i = 0; i < schema.Length; i++)
                        {
                            document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                        }
                        indexWriter.AddDocument(document);
                        hash3Layer.Add(createdAt, userId, text);
                        uniqDocFoundCnt++;
                    }
                    docFoundCnt++;
                }
                docCnt++;

                progress.PrintIncrementExperiment(string.Format(
                    "uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
                    uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt,
                    (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
            }
            sr.Close();
        }
    }
    progress.PrintTotalTime();

    Console.WriteLine(string.Format(
        "uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
        uniqDocFoundCnt, docCnt,
        (docCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docCnt),
        (docFoundCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docFoundCnt)));
    Console.WriteLine("Not used doc count: " + notUsedDocCnt);

    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();

    Console.WriteLine("Finish");
    Console.ReadKey();
}
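// A hypothetical call site for Transform above. The class name "TweetKeywordTransformer",
// the paths, and the keyword list are illustrative assumptions, not from the original code.
var transformer = new TweetKeywordTransformer();
var searchKeywords = new HashSet<string> { "earthquake", "quake", "tsunami" };
transformer.Transform(@"D:\Twitter\RawTsv\", @"D:\Twitter\Index\EarthquakeIndex\", searchKeywords);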
public void Start()
{
    if (!outputpath.EndsWith("\\"))
    {
        outputpath += "\\";
    }

    var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);
    var searcher = LuceneOperations.GetIndexSearcher(inputpath);
    var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
    var scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

    int iter = 0;
    bool bContinue = threshold != 0;
    while (bContinue && iter < 5)
    {
        iter++;
        Console.WriteLine("iteration------------------" + iter);

        List<string> keywordsNew;

        #region Calculate Keywords
        // Re-estimate the keyword list from the most frequent words of the current result set.
        var counter = new Counter<string>();
        foreach (var scoredDoc in scoredDocs)
        {
            var doc = searcher.Doc(scoredDoc.doc);
            var content = doc.Get(searchfield);
            foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
            {
                counter.Add(word);
            }
        }
        keywordsNew = counter.GetMostFreqObjs(keywordNum);
        #endregion

        var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);

        #region Test whether exit
        // Stop iterating once the overlap between the old and new result sets reaches the threshold.
        int repeatNum = 0;
        var docIDs = new HashSet<int>();
        foreach (var scoredDoc in scoredDocs)
        {
            docIDs.Add(scoredDoc.doc);
        }
        foreach (var scoredDocNew in scoredDocsNew)
        {
            if (docIDs.Contains(scoredDocNew.doc))
            {
                repeatNum++;
            }
        }
        bContinue = (double)repeatNum / scoredDocs.Length < threshold;
        #endregion

        Console.WriteLine(repeatNum + " " + scoredDocsNew.Length);

        keywords = keywordsNew;
        scoredDocs = scoredDocsNew;
        Console.WriteLine(StringOperations.GetMergedString(keywords));
    }

    max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
    scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

    var writer = LuceneOperations.GetIndexWriter(outputpath);
    foreach (var scoredDoc in scoredDocs)
    {
        Document doc = searcher.Doc(scoredDoc.doc);
        writer.AddDocument(doc);
    }
    writer.Optimize();
    writer.Close();

    if (isPrintRemovedDocuments)
    {
        var sw = new StreamWriter(outputpath + "removeDocuments.txt");
        var selectedDocIDs = new HashSet<int>();
        foreach (var scoredDoc in scoredDocs)
        {
            selectedDocIDs.Add(scoredDoc.doc);
        }

        var reader = searcher.GetIndexReader();
        for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
        {
            if (!selectedDocIDs.Contains(iDoc))
            {
                sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
            }
        }
        reader.Close();
        sw.Flush();
        sw.Close();
    }

    searcher.Close();

    Console.WriteLine("Done");
    Console.ReadKey();
}
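// A minimal sketch (assumption, not part of the original code) of the exit test above: the
// fraction of the previous result set that is retrieved again by the new keyword list; the
// loop stops once this overlap ratio reaches the threshold. It assumes the Search results are
// Lucene ScoreDoc instances and requires System.Linq.
private static double OverlapRatio(ScoreDoc[] oldDocs, ScoreDoc[] newDocs)
{
    if (oldDocs.Length == 0)
    {
        return 0.0;
    }
    var oldIds = new HashSet<int>(oldDocs.Select(d => d.doc));
    int repeat = newDocs.Count(d => oldIds.Contains(d.doc));   // docs retrieved by both keyword sets
    return (double)repeat / oldDocs.Length;
}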
public void TransformWithFileNames(string[] files, string indexPath, HashSet<string> searchHashSet, SearchSpinn3rType searchType)
{
    double tweetCnt = 0;
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);

    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    var progress = new ProgramProgress(files.Length);
    int docFoundCount = 0;
    int totalDocCount = 0;

    foreach (var file in files)
    {
        FileOperations.ReadJsonFile<Spinn3rTwitterData>(file, (data) =>
        {
            tweetCnt += data.count;
            //Console.WriteLine(data.count);
            //Console.WriteLine(data.items[0].main);
            foreach (var tweet in data.items)
            {
                if (tweet.lang != "en")
                {
                    continue;
                }

                // Decide whether the tweet matches the search set, either by tokenized text or by author.
                bool isContainSearch = false;
                switch (searchType)
                {
                    case SearchSpinn3rType.Main:
                        var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig);
                        foreach (var word in words)
                        {
                            if (searchHashSet.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        break;
                    case SearchSpinn3rType.User:
                        isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower());
                        break;
                    default:
                        throw new ArgumentException();
                }

                if (isContainSearch)
                {
                    var document = new Document();
                    document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    indexWriter.AddDocument(document);
                    docFoundCount++;
                }
                totalDocCount++;
            }
        });

        // Guard against a zero denominator when the first files contain no English tweets.
        progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)",
            docFoundCount, totalDocCount, totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount));
    }
    progress.PrintTotalTime();

    Console.WriteLine("Final docFound: {0} out of {1} ({2}%)",
        docFoundCount, totalDocCount, totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount);

    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();

    Util.ProgramFinishHalt();
}
public void Start()
{
    if (!Configure.InputPath.EndsWith("\\"))
    {
        Configure.InputPath += "\\";
    }

    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var docNum = reader.NumDocs();
    // Progress is reported every 1% of the collection; guard against indexes with fewer than 100 documents.
    var docNumPart = Math.Max(1, docNum / 100);
    Console.WriteLine("Total: " + docNum);

    Random random = new Random(Configure.SampleSeed == -1 ? (int)DateTime.Now.Ticks : Configure.SampleSeed);

    //Topwords
    var counter = new Counter<string>();
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        if (iDoc % docNumPart == 0)
        {
            Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
        }
        if (random.NextDouble() > Configure.SampleRatio)
        {
            continue;
        }

        var doc = reader.Document(iDoc);
        var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
        var words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
        foreach (var word in words)
        {
            counter.Add(word);
        }
    }

    var topwords = counter.GetMostFreqObjs(Configure.TopWordCount);
    var wordCounterDict = counter.GetCountDictionary();

    var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt");
    foreach (var topword in topwords)
    {
        swTopWords.WriteLine(topword);
    }
    swTopWords.Flush();
    swTopWords.Close();

    //CoOccurrence
    if (Configure.IsPrintCooccurrence)
    {
        var occurCounterDict = new Dictionary<string, Counter<string>>();
        foreach (var topword in topwords)
        {
            occurCounterDict.Add(topword, new Counter<string>());
        }

        for (int iDoc = 0; iDoc < docNum; iDoc++)
        {
            if (iDoc % docNumPart == 0)
            {
                Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
            }
            if (random.NextDouble() > Configure.SampleRatio)
            {
                continue;
            }

            var doc = reader.Document(iDoc);
            var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
            var words = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig));

            // For every pair of top words appearing in the same document, increment the pair count.
            foreach (var word in words)
            {
                if (occurCounterDict.ContainsKey(word))
                {
                    var occurCounter = occurCounterDict[word];
                    foreach (var word2 in words)
                    {
                        if (word2 == word)
                        {
                            continue;
                        }
                        if (occurCounterDict.ContainsKey(word2))
                        {
                            occurCounter.Add(word2);
                        }
                    }
                }
            }
        }

        // Keep only the most frequent co-occurring pairs.
        var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount);
        var pairDict = new Dictionary<int, Tuple<string, string>>();
        var iPair = 0;
        foreach (var kvp in occurCounterDict)
        {
            var word = kvp.Key;
            var occurCounter = kvp.Value;
            foreach (var kvp2 in occurCounter.GetCountDictionary())
            {
                heapSort.Insert(iPair, kvp2.Value);
                pairDict.Add(iPair, new Tuple<string, string>(word, kvp2.Key));
                iPair++;
            }
        }

        var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt");
        foreach (var kvp in heapSort.GetSortedDictionary())
        {
            var pair = pairDict[kvp.Key];
            swCoOccurrence.WriteLine("{0} - {1}\t{2}", pair.Item1, pair.Item2, kvp.Value);
        }
        swCoOccurrence.Flush();
        swCoOccurrence.Close();
    }

    reader.Close();
}
public static void Test()
{
    string indexPath = @"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\WeiboSortByHotIndex_Time_RemoveNoise2_RemoveSimilar2";
    var reader = LuceneOperations.GetIndexReader(indexPath);

    //var keywords = new string[]{"街","信","死","女","清","刷","骂","愿","爱","查","舰","版","通","岁","撕"};
    //foreach (var keyword in keywords)
    {
        var sw = new StreamWriter(@"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\TestTokenizer" + "Stat" + ".txt", false, Encoding.UTF8);
        //ChineseWordBreaker chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\");

        int cnt1 = 0, cnt2 = 0;
        int cnt1all = 0, cnt2all = 0;
        for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
        {
            string sentence = reader.Document(iDoc).Get("NewsArticleDescription");
            var words1 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ICTCLAS, StopWordsFile.CH));
            var words2 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ChineseWordBreaker, StopWordsFile.CH));

            //bool isPrint = false;
            //foreach (var word in words1)
            //    if (word.Length == 1)
            //    {
            //        isPrint = true;
            //        cnt1++;
            //    }
            //foreach (var word in words2)
            //    if (word.Length == 2)
            //    {
            //        isPrint = true;
            //        cnt2++;
            //    }

            cnt1all += words1.Count;
            cnt2all += words2.Count;

            //if (isPrint)
            //{
            //    sw.WriteLine("-------------{0}-------------", iDoc);
            //    sw.WriteLine(sentence);
            //    sw.WriteLine("[ICT]\t" + StringOperations.GetMergedString(words1));
            //    sw.WriteLine("[CWB]\t" + StringOperations.GetMergedString(words2));
            //    sw.WriteLine("[ICT--]\t" + Marshal.PtrToStringAnsi(NLPIR_ParagraphProcess(sentence, 1)));
            //    //sw.WriteLine("[CWB--]\t" + chineseWordBreaker.GetResult(sentence));
            //    sw.WriteLine();
            //    sw.Flush();
            //}
        }

        sw.WriteLine("cnt1 = " + cnt1);
        sw.WriteLine("cnt2 = " + cnt2);
        sw.WriteLine("cnt1all = " + cnt1all);
        sw.WriteLine("cnt2all = " + cnt2all);
        sw.Flush();
        sw.Close();
    }
}
List<int> GetLanguageErrorDocuments(IndexReader indexreader, string outputfile)
{
    Console.WriteLine("==========Remove language error documents!==========");
    StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
    List<int> removedDocuments = new List<int>();

    var stopWords = IsEnglish
        ? FileOperations.LoadKeyWordFile(StopWordsFile.EN, true)
        : FileOperations.LoadKeyWordFile(StopWordsFile.CH, false);
    var stopHash = Util.GetHashSet(stopWords);

    int docNum = indexreader.NumDocs();
    string titlefield = this.TitleField;
    string bodyfield = this.BodyField;
    int removedDocNum = 0;
    Console.WriteLine("Total documents: {0}", docNum);

    var tokenConfig = new TokenizeConfig(IsEnglish ? TokenizerType.Standard : TokenizerType.ICTCLAS, StopWordsFile.NO);

    DoubleStatistics stat_percent = new DoubleStatistics();
    DoubleStatistics stat_absolute = new DoubleStatistics();

    for (int idoc = 0; idoc < docNum; idoc++)
    {
        // Progress report every 10,000 documents; skipped at idoc == 0 so the percentage
        // below never divides by zero and document 0 is still processed.
        if (idoc % 10000 == 0 && idoc != 0)
        {
            Console.WriteLine("Process " + idoc + "th document!");
            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, idoc, 100 * removedDocNum / idoc);
            if (IsPrintTextFiles)
            {
                sw.Flush();
            }
        }

        Document document = indexreader.Document(idoc);
        string content = document.Get(titlefield) + " " + document.Get(bodyfield);
        if (IsEnglish)
        {
            content = content.ToLower();
        }

        var words = NLPOperations.Tokenize(content, tokenConfig);
        var termCnt0 = words.Count;   // all tokens
        var termCnt1 = 0;             // non-stop-word tokens
        foreach (var word in words)
        {
            if (!stopHash.Contains(word))
            {
                termCnt1++;
            }
        }

        // Heuristic: well-formed text in the target language contains a reasonable share of stop
        // words. Remove the document if its stop-word fraction is below MinLanguageCorrectRatio;
        // empty documents are removed as well, which also avoids a division by zero.
        if (termCnt0 == 0 || ((double)termCnt0 - termCnt1) / termCnt0 < MinLanguageCorrectRatio)
        {
            if (IsPrintTextFiles)
            {
                sw.WriteLine(DocumentToString(document));
            }
            removedDocuments.Add(idoc);
            removedDocNum++;
        }
        else
        {
            stat_absolute.AddNumber(termCnt0 - termCnt1);
            stat_percent.AddNumber(100.0 * (termCnt0 - termCnt1) / termCnt0);
        }
    }

    Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
    if (IsPrintTextFiles)
    {
        sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
        sw.Flush();
        sw.Close();
    }

    Console.WriteLine(stat_percent.ToString("stat_percent"));
    Console.WriteLine(stat_absolute.ToString("stat_absolute"));

    return removedDocuments;
}
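// A hypothetical sketch of how the removal lists produced by the filter methods above might be
// applied: merge the returned document ids and copy every surviving document into a new index.
// "WriteCleanedIndex" and "cleanIndexPath" are illustrative assumptions; only the Lucene and
// LuceneOperations calls already used in this listing are relied on.
private void WriteCleanedIndex(IndexReader indexreader, string cleanIndexPath, params List<int>[] removedDocumentLists)
{
    var removedIds = new HashSet<int>();
    foreach (var list in removedDocumentLists)
    {
        foreach (var idoc in list)
        {
            removedIds.Add(idoc);
        }
    }

    var writer = LuceneOperations.GetIndexWriter(cleanIndexPath);
    for (int idoc = 0; idoc < indexreader.NumDocs(); idoc++)
    {
        if (!removedIds.Contains(idoc))
        {
            writer.AddDocument(indexreader.Document(idoc));   // keep only documents no filter flagged
        }
    }
    writer.Optimize();
    writer.Close();
}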