/// <summary>
/// Scans every document in the index and collects the ids of documents whose
/// hashtag count falls outside [MinHashtagNumber, MaxHashtagNumber].
/// </summary>
/// <param name="indexreader">Open Lucene index reader to scan.</param>
/// <param name="outputfile">Text dump of removed documents; only written when IsPrintTextFiles is true.</param>
/// <returns>0-based document ids selected for removal.</returns>
private List<int> GetHashtagNumberInappropriateDocuments(IndexReader indexreader, string outputfile)
{
    Console.WriteLine("==========Remove inappropriate hashtag number documents!==========");
    // Writer is only created (and used) when text dumps are enabled.
    StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
    List<int> removedDocuments = new List<int>();
    int docNum = indexreader.NumDocs();
    string bodyfield = this.BodyField;
    int removedDocNum = 0;
    Console.WriteLine("Total documents: {0}", docNum);
    var tokenConfig = new TokenizeConfig(TokenizerType.Hashtag, StopWordsFile.NO);
    for (int idoc = 0; idoc < docNum; idoc++)
    {
        // Progress report every 10000 documents; skip idoc == 0 so the
        // percentage below never divides by zero.
        if (idoc % 10000 == 0 && idoc != 0)
        {
            Console.WriteLine("Process " + idoc + "th document!");
            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, idoc, 100 * removedDocNum / idoc);
            if (IsPrintTextFiles)
            {
                sw.Flush();
            }
        }
        Document document = indexreader.Document(idoc);
        string content = document.Get(bodyfield);
        var words = NLPOperations.Tokenize(content, tokenConfig);
        // The Hashtag tokenizer yields only hashtags, so words.Count is the hashtag count.
        if (words.Count < MinHashtagNumber || words.Count > MaxHashtagNumber)
        {
            if (IsPrintTextFiles)
            {
                sw.WriteLine(DocumentToString(document));
            }
            removedDocuments.Add(idoc);
            removedDocNum++;
        }
    }
    // BUG FIX: the final summary previously computed 100 * removedDocNum / docNum
    // unconditionally and threw DivideByZeroException on an empty index.
    int removedPercent = docNum == 0 ? 0 : 100 * removedDocNum / docNum;
    Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, removedPercent);
    if (IsPrintTextFiles)
    {
        sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, removedPercent);
        sw.Flush();
        sw.Close();
    }
    return removedDocuments;
}
/// <summary>
/// Creates a word-occurrence analyzer, storing all settings in a fresh
/// configure object.
/// </summary>
/// <param name="inputPath">Path of the input corpus/index.</param>
/// <param name="tokenizeConfig">Tokenizer settings to apply.</param>
/// <param name="FieldWeightDict">Per-field weights used when reading documents.</param>
/// <param name="TopWordCount">Number of most frequent words to keep.</param>
/// <param name="TopOccurrenceCount">Number of top co-occurrences to keep.</param>
public WordOccurrence(string inputPath, TokenizeConfig tokenizeConfig, Dictionary<string, int> FieldWeightDict, int TopWordCount = 100, int TopOccurrenceCount = 1000)
{
    Configure = new WordOccurrenceConfigure
    {
        InputPath = inputPath,
        TokenizeConfig = tokenizeConfig,
        TopWordCount = TopWordCount,
        TopOccurrenceCount = TopOccurrenceCount,
        FieldWeightDict = FieldWeightDict
    };
}
/// <summary>
/// Loads every noise-removal setting from configRemoveNoiseByRules.txt,
/// grouped here by the rule that consumes it.
/// </summary>
public RemoveNoiseByRules()
{
    var config = FileOperations.LoadConfigure("configRemoveNoiseByRules.txt");
    // Each configure entry is a list of values; this class always reads the first one.
    Func<string, string> first = key => config[key][0];
    var tabSeparator = new char[] { '\t' };

    InputPath = first("InputPath");
    OutputPath = first("OutputPath");

    // Rule: remove documents that are too short.
    IsRemoveShortDocuments = bool.Parse(first("IsRemoveShortDocuments"));
    TokenizeConfig = new TokenizeConfig(first("TokenizeConfig"));
    MinLongDocumentsWordCount = int.Parse(first("MinLongDocumentsWordCount"));

    // Rule: remove documents with an unexpected number of hashtags.
    IsRemoveByHashtagNumber = bool.Parse(first("IsRemoveByHashtagNumber"));
    MinHashtagNumber = int.Parse(first("MinHashtagNumber"));
    MaxHashtagNumber = int.Parse(first("MaxHashtagNumber"));

    // Rule: remove documents that do not look like the expected language.
    IsRemoveLanguageError = bool.Parse(first("IsRemoveLanguageError"));
    IsEnglish = bool.Parse(first("IsEnglish"));
    MinLanguageCorrectRatio = double.Parse(first("MinLanguageCorrectRatio"));

    // Rule: remove documents containing configured noisy words.
    IsRemoveDocumentsWithCertainWords = bool.Parse(first("IsRemoveDocumentsWithCertainWords"));
    NoisyWords = first("NoisyWords").Split(tabSeparator, StringSplitOptions.RemoveEmptyEntries);
    NoisyWordFilterCount = int.Parse(first("NoisyWordFilterCount"));

    // Rule: remove documents whose leading paragraph lacks the keywords.
    IsRemoveLeadingParagraphNoKeywords = bool.Parse(first("IsRemoveLeadingParagraphNoKeywords"));
    IsSearchKeywordBruteForce = bool.Parse(first("IsSearchKeywordBruteForce"));
    Keywords = first("Keywords").Split(tabSeparator, StringSplitOptions.RemoveEmptyEntries);
    LeadingParaSentenseNum = int.Parse(first("LeadingParaSentenseNum"));
    TitlePassNumber = int.Parse(first("TitlePassNumber"));
    LeadingPassNumber = int.Parse(first("LeadingPassNumber"));
    BodyPassNumber = int.Parse(first("BodyPassNumber"));

    // Rule: remove yahoo news.
    IsRemoveYahooNews = bool.Parse(first("IsRemoveYahooNews"));

    // Settings shared by all rules.
    IsCaseSensitive = bool.Parse(first("IsCaseSensitive"));
    IsPrintTextFiles = bool.Parse(first("IsPrintTextFiles"));
    TitleField = first("TitleField");
    BodyField = first("BodyField");
    DateField = first("DateField");
    SourceField = first("SourceField");
    URLField = first("URLField");
}
/// <summary>
/// Tokenizes this document (the title repeated three times so its terms
/// carry triple weight relative to the body), registers unseen words in the
/// shared KmeansLexicon, and stores the L2-normalized term-frequency vector.
/// </summary>
/// <param name="tokenizeConfig">Tokenizer settings to apply.</param>
public void Parse(TokenizeConfig tokenizeConfig)
{
    // Title is concatenated three times to boost title terms.
    var weightedText = string.Join(" ", new[] { Title, Title, Title, Body });
    var tokens = NLPOperations.Tokenize(weightedText, tokenizeConfig);

    var counter = new Counter<int>();
    foreach (var token in tokens)
    {
        int index;
        if (!KmeansLexicon.Word2IndexDict.TryGetValue(token, out index))
        {
            // First sighting: append the word to the shared lexicon maps.
            index = KmeansLexicon.Word2IndexDict.Count;
            KmeansLexicon.Word2IndexDict.Add(token, index);
            KmeansLexicon.Index2WordDict.Add(index, token);
        }
        counter.Add(index);
    }

    var rawCounts = counter.GetCountDictionary().ToDictionary(kvp => kvp.Key, kvp => (double)kvp.Value);
    Vector = SortUtils.EnsureSortedByKey(rawCounts);

    // Scale to unit length; afterwards the stored norm is 1 by construction.
    Norm = Maths.GetVectorLength(Vector);
    Vector = Maths.GetVectorMultiply(Vector, 1.0 / Norm);
    Norm = 1;
    _orgVector = Vector;
}
/// <summary>
/// Computes and prints word-distribution statistics for an index: per-document
/// word and unique-word counts, per-word document frequency and occurrence
/// counts, then writes CSV histograms next to the index folder.
/// </summary>
/// <param name="inputPath">Folder of the Lucene index to analyze.</param>
/// <param name="tokenConfig">Tokenizer settings; a FeatureVector tokenizer switches to feature-vector field weights.</param>
public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
{
    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var docNum = indexReader.NumDocs();
    int[] docWordCnt = new int[docNum];
    int[] docUniqWordCnt = new int[docNum];
    Dictionary<string, int> wordDocCntDict = new Dictionary<string, int>();
    Dictionary<string, int> wordOccCntDict = new Dictionary<string, int>();
    var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
        ? BingNewsFields.FeatureVectorFieldWeights
        : BingNewsFields.NewsFieldWeights;

    ProgramProgress progress = new ProgramProgress(docNum);
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        var document = indexReader.Document(iDoc);
        var content = LuceneOperations.GetContent(document, fieldWeights);
        var words = NLPOperations.Tokenize(content, tokenConfig);
        var uniqueWords = new HashSet<string>(words);
        docWordCnt[iDoc] = words.Count;
        docUniqWordCnt[iDoc] = uniqueWords.Count;

        // Document frequency: one increment per document containing the word.
        // TryGetValue replaces the previous ContainsKey + indexer double lookup.
        foreach (var word in uniqueWords)
        {
            int cnt;
            wordDocCntDict.TryGetValue(word, out cnt);
            wordDocCntDict[word] = cnt + 1;
        }
        // Occurrence frequency: one increment per token.
        foreach (var word in words)
        {
            int cnt;
            wordOccCntDict.TryGetValue(word, out cnt);
            wordOccCntDict[word] = cnt + 1;
        }
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    indexReader.Close();

    //Statistics
    DoubleStatistics statDocWordCnt = new DoubleStatistics();
    DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
    DoubleStatistics statWordDocCnt = new DoubleStatistics();
    DoubleStatistics statWordOccCnt = new DoubleStatistics();
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        statDocWordCnt.AddNumber(docWordCnt[iDoc]);
        statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
    }
    foreach (var kvp in wordDocCntDict)
    {
        statWordDocCnt.AddNumber(kvp.Value);
    }
    foreach (var kvp in wordOccCntDict)
    {
        statWordOccCnt.AddNumber(kvp.Value);
    }
    Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
    Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
    Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
    // FIX: label previously read "wordOccCnt", inconsistent with the sibling labels above.
    Console.WriteLine(statWordOccCnt.ToString("statWordOccCnt"));

    //Hist
    var docWordCntHist = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1);
    var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1);
    var wordDocCntHist = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
    var wordDocCntHist2 = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1);
    docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
    docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
    wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
    wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");
    Console.Read();
}
/// <summary>
/// Scans raw tab-separated tweet dumps under <paramref name="inputFolder"/>,
/// keeps tweets whose text contains at least one keyword, de-duplicates on
/// (CreatedAt, UserId, Text), and writes the survivors into a new Lucene
/// index at <paramref name="indexPath"/> (any existing index there is deleted).
/// </summary>
/// <param name="inputFolder">Folder tree of .txt tweet dumps.</param>
/// <param name="indexPath">Target Lucene index folder; recreated from scratch.</param>
/// <param name="keywords">Tokens to search for in the tweet text.</param>
public void Transform(string inputFolder, string indexPath, HashSet<string> keywords)
{
    Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
    Console.WriteLine("InputFolder: " + inputFolder + "\n");
    string notParseSpecString = "Temp-DoNotParse";
    inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

    // Column layout of the input .txt dumps.
    string[] schema = new[] { "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount", "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount" };
    var schemeDict = Util.GetInvertedDictionary(schema);
    var textFieldIndex = schemeDict["Text"];
    var createdTimeFieldIndex = schemeDict["CreatedAt"];
    var userIdFieldIndex = schemeDict["UserId"];

    // Rebuild the output index from scratch.
    if (Directory.Exists(indexPath))
    {
        Directory.Delete(indexPath, true);
    }
    var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

    //Preprocess: estimate the tweet count so progress reporting is meaningful.
    Console.WriteLine("Start preprocesing...");
    ProgramProgress progress = new ProgramProgress(files.Length);
    int estiDocCnt = 0;
    foreach (var file in files)
    {
        estiDocCnt += FileOperations.GetLineCount(file);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

    //Parse
    Console.WriteLine("Start parsing...");
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
    progress = new ProgramProgress(estiDocCnt);
    var sep = new char[] { '\t' };
    int uniqDocFoundCnt = 0;
    int docFoundCnt = 0;
    int docCnt = 0;
    ThreeLayerHashSet<string, long, string> hash3Layer = new ThreeLayerHashSet<string, long, string>();
    int notUsedDocCnt = 0;
    foreach (var file in files)
    {
        if (file.Contains(notParseSpecString))
        {
            continue;
        }
        if (file.EndsWith(".txt"))
        {
            // FIX: using guarantees the reader is closed even if a line throws.
            using (var sr = new StreamReader(file))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    var tokens = line.Split(sep, StringSplitOptions.None);
                    if (tokens.Length != schema.Length)
                    {
                        // Malformed row: count it and move on.
                        notUsedDocCnt++;
                        continue;
                    }
                    var words = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                    bool isContainSearch = false;
                    foreach (var word in words)
                    {
                        if (keywords.Contains(word))
                        {
                            isContainSearch = true;
                            break;
                        }
                    }
                    if (isContainSearch)
                    {
                        string createdAt = tokens[createdTimeFieldIndex];
                        long userId = long.Parse(tokens[userIdFieldIndex]);
                        string text = tokens[textFieldIndex];
                        // De-duplicate on (time, user, text) before indexing.
                        if (!hash3Layer.Contains(createdAt, userId, text))
                        {
                            var document = new Document();
                            for (int i = 0; i < schema.Length; i++)
                            {
                                document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                            }
                            indexWriter.AddDocument(document);
                            hash3Layer.Add(createdAt, userId, text);
                            uniqDocFoundCnt++;
                        }
                        docFoundCnt++;
                    }
                    docCnt++;
                    // docCnt >= 1 here, so only docFoundCnt needs a zero guard.
                    progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%", uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
                }
            }
        }
    }
    progress.PrintTotalTime();
    // BUG FIX: the final summary previously divided by docCnt and docFoundCnt
    // without zero guards and crashed when no rows (or no matches) were found.
    Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%", uniqDocFoundCnt, docCnt, (docCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docCnt), (docFoundCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docFoundCnt)));
    Console.WriteLine("Not used doc count: " + notUsedDocCnt);

    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();
    Console.WriteLine("Finish");
    Console.ReadKey();
}
/// <summary>
/// Iteratively refines a keyword set: search the index, take the most
/// frequent words of the hits as the next keyword set, and repeat (at most
/// 5 rounds) until the hit set overlaps the previous one by at least the
/// configured threshold. The final hits are copied into a new index, and the
/// rejected documents are optionally dumped to a text file.
/// </summary>
public void Start()
{
    if (!outputpath.EndsWith("\\"))
    {
        outputpath += "\\";
    }
    var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);
    var searcher = LuceneOperations.GetIndexSearcher(inputpath);
    var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
    var scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

    int iter = 0;
    // FIX: replaced "threshold == 0 ? false : true" with the direct comparison.
    // A zero threshold disables the iterative refinement entirely.
    bool bContinue = threshold != 0;
    while (bContinue && iter < 5)
    {
        iter++;
        Console.WriteLine("iteration------------------" + iter);

        List<string> keywordsNew;
        #region Calculate Keywords
        // Count word frequencies over the current hit set and keep the top ones.
        var counter = new Counter<string>();
        foreach (var scoredDoc in scoredDocs)
        {
            var doc = searcher.Doc(scoredDoc.doc);
            var content = doc.Get(searchfield);
            foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
            {
                counter.Add(word);
            }
        }
        keywordsNew = counter.GetMostFreqObjs(keywordNum);
        #endregion

        var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);

        #region Test whether exit
        // Continue only while the new hit set overlaps the old one too little.
        int repeatNum = 0;
        var docIDs = new HashSet<int>();
        foreach (var scoredDoc in scoredDocs)
        {
            docIDs.Add(scoredDoc.doc);
        }
        foreach (var scoredDocNew in scoredDocsNew)
        {
            if (docIDs.Contains(scoredDocNew.doc))
            {
                repeatNum++;
            }
        }
        bContinue = (double)repeatNum / scoredDocs.Length < threshold;
        #endregion

        Console.WriteLine(repeatNum + " " + scoredDocsNew.Length);
        keywords = keywordsNew;
        scoredDocs = scoredDocsNew;
        Console.WriteLine(StringOperations.GetMergedString(keywords));
    }

    // Re-search with the final keywords, possibly keeping a different ratio.
    max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
    scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

    var writer = LuceneOperations.GetIndexWriter(outputpath);
    foreach (var scoredDoc in scoredDocs)
    {
        Document doc = searcher.Doc(scoredDoc.doc);
        writer.AddDocument(doc);
    }
    writer.Optimize();
    writer.Close();

    if (isPrintRemovedDocuments)
    {
        // Dump every document that did NOT make it into the output index.
        var sw = new StreamWriter(outputpath + "removeDocuments.txt");
        var selectedDocIDs = new HashSet<int>();
        foreach (var scoredDoc in scoredDocs)
        {
            selectedDocIDs.Add(scoredDoc.doc);
        }
        var reader = searcher.GetIndexReader();
        for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
        {
            if (!selectedDocIDs.Contains(iDoc))
            {
                sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
            }
        }
        reader.Close();
        sw.Flush();
        sw.Close();
    }
    searcher.Close();

    Console.WriteLine("Done");
    Console.ReadKey();
}
/// <summary>
/// Reads Spinn3r JSON tweet files, keeps English tweets matching the search
/// set (by tokenized tweet text or by lower-cased author link, depending on
/// <paramref name="searchType"/>), and writes them into a Lucene index.
/// </summary>
/// <param name="files">Spinn3r JSON files to process.</param>
/// <param name="indexPath">Target Lucene index folder.</param>
/// <param name="searchHashSet">Words (Main) or author links (User) to match.</param>
/// <param name="searchType">Selects which tweet attribute is matched.</param>
public void TransformWithFileNames(string[] files, string indexPath, HashSet<string> searchHashSet, SearchSpinn3rType searchType)
{
    double tweetCnt = 0;
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    var progress = new ProgramProgress(files.Length);
    int docFoundCount = 0;
    int totalDocCount = 0;
    foreach (var file in files)
    {
        FileOperations.ReadJsonFile<Spinn3rTwitterData>(file, (data) =>
        {
            tweetCnt += data.count;
            foreach (var tweet in data.items)
            {
                // Only English tweets are considered.
                if (tweet.lang != "en")
                {
                    continue;
                }
                bool isContainSearch = false;
                switch (searchType)
                {
                    case SearchSpinn3rType.Main:
                        // Match on any tokenized word of the tweet body.
                        var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig);
                        foreach (var word in words)
                        {
                            if (searchHashSet.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        break;
                    case SearchSpinn3rType.User:
                        // Match on the lower-cased author link.
                        isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower());
                        break;
                    default:
                        throw new ArgumentException();
                }
                if (isContainSearch)
                {
                    var document = new Document();
                    document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    indexWriter.AddDocument(document);
                    docFoundCount++;
                }
                totalDocCount++;
            }
        });
        // BUG FIX: guard the percentage against totalDocCount == 0 (e.g. the
        // first file contains no English tweets), which previously threw
        // DivideByZeroException.
        progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount));
    }
    progress.PrintTotalTime();
    Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount);
    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();
    Util.ProgramFinishHalt();
}
/// <summary>
/// Scans every document and flags those whose stop-word ratio falls below
/// MinLanguageCorrectRatio: genuine prose in the target language normally
/// contains a sizable share of stop words, so a low ratio suggests the text
/// is in the wrong language or is not real prose.
/// </summary>
/// <param name="indexreader">Open Lucene index reader to scan.</param>
/// <param name="outputfile">Dump file for removed documents; used only when IsPrintTextFiles is true.</param>
/// <returns>0-based document ids selected for removal.</returns>
List<int> GetLanguageErrorDocuments(IndexReader indexreader, string outputfile)
{
    Console.WriteLine("==========Remove language error documents!==========");
    // Writer is only created (and used) when text dumps are enabled.
    StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
    List<int> removedDocuments = new List<int>();
    // Stop-word list matches the configured language (English vs. Chinese).
    var stopWords = IsEnglish ? FileOperations.LoadKeyWordFile(StopWordsFile.EN, true) : FileOperations.LoadKeyWordFile(StopWordsFile.CH, false);
    var stopHash = Util.GetHashSet(stopWords);
    int docNum = indexreader.NumDocs();
    string titlefield = this.TitleField;
    string bodyfield = this.BodyField;
    int removedDocNum = 0;
    Console.WriteLine("Total documents: {0}", docNum);
    var tokenConfig = new TokenizeConfig(IsEnglish ? TokenizerType.Standard : TokenizerType.ICTCLAS, StopWordsFile.NO);
    // Stop-word share of the documents that are KEPT, for reporting.
    DoubleStatistics stat_percent = new DoubleStatistics();
    DoubleStatistics stat_absolute = new DoubleStatistics();
    for (int idoc = 0; idoc < docNum; idoc++)
    {
        // Progress report every 10000 documents; idoc == 0 is skipped so the
        // percentage below never divides by zero.
        if (idoc % 10000 == 0)
        {
            if (idoc == 0)
            {
                continue;
            }
            Console.WriteLine("Process " + idoc + "th document!");
            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, idoc, 100 * removedDocNum / idoc);
            if (IsPrintTextFiles)
            {
                sw.Flush();
            }
        }
        Document document = indexreader.Document(idoc);
        // Title and body are judged together.
        string content = document.Get(titlefield) + " " + document.Get(bodyfield);
        if (IsEnglish)
        {
            // The English stop-word comparison is done in lower case.
            content = content.ToLower();
        }
        var words = NLPOperations.Tokenize(content, tokenConfig);
        var termCnt0 = words.Count; // total tokens
        var termCnt1 = 0;           // tokens that are NOT stop words
        foreach (var word in words)
        {
            if (!stopHash.Contains(word))
            {
                termCnt1++;
            }
        }
        // (termCnt0 - termCnt1) / termCnt0 is the stop-word ratio; remove the
        // document when it is below the configured minimum.
        // NOTE(review): when termCnt0 == 0 the ratio is NaN and the comparison
        // is false, so empty documents are never removed here AND are fed into
        // the kept-document statistics as NaN — confirm that is intended.
        if (((double)termCnt0 - termCnt1) / termCnt0 < MinLanguageCorrectRatio)
        {
            if (IsPrintTextFiles)
            {
                sw.WriteLine(DocumentToString(document));
            }
            removedDocuments.Add(idoc);
            removedDocNum++;
        }
        else
        {
            // Kept document: record its absolute and percentage stop-word counts.
            stat_absolute.AddNumber(termCnt0 - termCnt1);
            stat_percent.AddNumber((100.0) * (termCnt0 - termCnt1) / termCnt0);
        }
    }
    Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
    if (IsPrintTextFiles)
    {
        sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
        sw.Flush();
        sw.Close();
    }
    Console.WriteLine(stat_percent.ToString("stat_percent"));
    Console.WriteLine(stat_absolute.ToString("stat_absolute"));
    return (removedDocuments);
}