/// <summary>
/// Prepares index writing: when splitting by time, installs the date-bucketing
/// function that maps a document's date string onto a bucket label; otherwise
/// registers one writer for the whole output under the empty-string key.
/// </summary>
private void InitializeWriters()
{
    if (!Configure.IsSplitByTime)
    {
        // Single-index mode: exactly one writer, keyed by the empty string.
        _writers.Add("", LuceneOperations.GetIndexWriter(Configure.OutputPath));
        return;
    }

    _dateTransferFunc = dateStr =>
    {
        var parsed = StringOperations.ParseDateTimeString(dateStr, _dateFormatString);
        if (Configure.SplitDayCount == 7)
        {
            // Weekly buckets: snap back to the first day (DayOfWeek == 0) of the same week.
            parsed = parsed.Subtract(TimeSpan.FromDays((int)parsed.DayOfWeek));
        }
        else
        {
            // Fixed-size buckets anchored at _minDateTime: subtract the residue
            // (fractional days included) since the last bucket boundary.
            var elapsedDays = parsed.Subtract(_minDateTime).TotalDays;
            parsed = parsed.Subtract(TimeSpan.FromDays(elapsedDays % Configure.SplitDayCount));
        }
        return parsed.ToString("yyyy-MM-dd");
    };
}
/// <summary>
/// Rewrites an existing tweet index so each document gets a sequential DocId and a
/// synthetic unique URL ("http://{docId}"), replacing whatever those fields held.
/// Input/output paths are hard-coded for the streaming RoseRiver experiment.
/// </summary>
public void StartTransformTweetIndexForStreamingRoseRiver()
{
    string inputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01\";
    string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01_MOD\";

    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

    string docIDField = BingNewsFields.DocId;
    string urlField = BingNewsFields.DocumentURL;

    // NumDocs() is loop-invariant; evaluate it once instead of on every iteration.
    int docCount = indexReader.NumDocs();
    ProgramProgress progress = new ProgramProgress(docCount);
    for (int iDoc = 0; iDoc < docCount; iDoc++)
    {
        // The original code aliased this object as both "inDoc" and "outDoc";
        // it is one Document instance, mutated in place and re-written.
        Document doc = indexReader.Document(iDoc);

        doc.RemoveField(docIDField);
        doc.Add(new Field(docIDField, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

        doc.RemoveField(urlField);
        doc.Add(new Field(urlField, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

        indexWriter.AddDocument(doc);
        progress.PrintIncrementExperiment();
    }

    indexWriter.Optimize();
    indexWriter.Close();
    indexReader.Close();
}
/// <summary>
/// Exports the index at Configure.InputPath as a bag-of-words text file
/// ("docIndex docIndex word word ..."), optionally filtering out documents with
/// fewer than Configure.MinWordCount tokens and writing survivors to a filtered index.
/// Content comes from a stored feature vector or from weighted document fields,
/// depending on Configure.IsLoadFromFeatureVector.
/// </summary>
public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var sw = new StreamWriter(Configure.OutputPath);

    // Only opened when filtering: holds the documents that pass the word-count gate.
    IndexWriter writer = null;
    if (Configure.IsFilterByWordCount)
    {
        writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
    }

    if (Configure.IsLoadFromFeatureVector)
    {
        Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
    }

    // NumDocs() is loop-invariant; evaluate once.
    int numDocs = reader.NumDocs();
    Console.WriteLine("Total: " + numDocs);

    int docIndex = 0;
    for (int iDoc = 0; iDoc < numDocs; iDoc++)
    {
        if (iDoc % 10000 == 0)
        {
            Console.WriteLine(iDoc);
            sw.Flush();
        }

        // Fetch the stored document once; the original re-read it for every use.
        Document doc = reader.Document(iDoc);
        string content = Configure.IsLoadFromFeatureVector
            ? doc.Get(BingNewsFields.FeatureVector)
            : LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);

        List<string> words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
        bool isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
        if (isPrintDoc)
        {
            if (Configure.IsFilterByWordCount)
            {
                writer.AddDocument(doc);
            }

            sw.Write(docIndex + " " + docIndex + " ");
            foreach (var word in words)
            {
                sw.Write(word + " ");
            }
            sw.Write("\n");
            docIndex++;
        }
    }

    if (Configure.IsFilterByWordCount)
    {
        writer.Optimize();
        writer.Close();
    }
    sw.Flush();
    sw.Close();
    reader.Close();
}
/// <summary>
/// Merges every index listed in InputPaths into a single index at OutputPath.
/// A first pass counts documents purely so the progress meter knows the total.
/// </summary>
public void Start()
{
    var writer = LuceneOperations.GetIndexWriter(OutputPath);

    // Pass 1: total document count across all sources (progress reporting only).
    var totalDocCnt = 0;
    foreach (var inputPath in InputPaths)
    {
        var countReader = LuceneOperations.GetIndexReader(inputPath);
        totalDocCnt += countReader.NumDocs();
        countReader.Close();
    }

    // Pass 2: copy every document across, source by source.
    var progress = new ProgramProgress(totalDocCnt);
    foreach (var inputPath in InputPaths)
    {
        var reader = LuceneOperations.GetIndexReader(inputPath);
        int docCount = reader.NumDocs();
        for (int docId = 0; docId < docCount; docId++)
        {
            writer.AddDocument(reader.Document(docId));
            progress.PrintIncrementExperiment();
        }
        reader.Close();
    }

    writer.Optimize();
    writer.Close();
}
/// <summary>
/// Builds a Lucene index from a cosmos Twitter dump where each line is one tweet
/// with tab-separated fields named by TwitterConfigure.TwitterSchema. The body
/// field is noise-cleaned before indexing. The file is read twice: once to count
/// lines for progress reporting, once to index.
/// </summary>
private void BuildFromTwitterTxt()
{
    string inputpath = TwitterConfigure.InputPath;
    string outputpath = TwitterConfigure.OutputPath;
    var schema = TwitterConfigure.TwitterSchema;
    string bodyField = TwitterConfigure.TwitterBodyField;
    var indexwriter = LuceneOperations.GetIndexWriter(outputpath);

    // Pass 1: count lines so the progress meter knows the total up front.
    int lineCnt = 0;
    using (var counterReader = new StreamReader(inputpath))
    {
        while (counterReader.ReadLine() != null)
        {
            lineCnt++;
        }
    }

    // Pass 2: parse and index every tweet.
    var seperator = new char[] { '\t' };
    var progress = new ProgramProgress(lineCnt);
    var sr = new StreamReader(inputpath);
    string line;
    while ((line = sr.ReadLine()) != null)
    {
        var tokens = line.Split(seperator);
        if (tokens.Length != schema.Length)
        {
            throw new Exception("Unmatch schema");
        }

        var document = new Document();
        for (int i = 0; i < tokens.Length; i++)
        {
            // The tweet body is cleaned of index noise; all other fields pass through.
            var value = schema[i] == bodyField
                ? RemoveContentNoise.RemoveTweetIndexNoise(tokens[i])
                : tokens[i];
            document.Add(new Field(schema[i], value, Field.Store.YES, Field.Index.ANALYZED));
        }
        indexwriter.AddDocument(document);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    sr.Close();

    indexwriter.Optimize();
    indexwriter.Close();
}
/// <summary>
/// Scans Spinn3r JSON files and indexes every English tweet whose text contains
/// <paramref name="searchStr"/> (case-insensitive substring match). One Lucene
/// document is written per matching tweet.
/// </summary>
/// <param name="files">Spinn3r JSON files to scan.</param>
/// <param name="indexPath">Destination Lucene index path.</param>
/// <param name="searchStr">Substring to look for in the tweet body.</param>
/// <param name="progressEndStr">Optional suffix appended to each progress line.</param>
public void TransformWithFileNameContentSearch(string[] files, string indexPath, string searchStr, string progressEndStr = null)
{
    double tweetCnt = 0;
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    searchStr = searchStr.ToLower();

    var progress = new ProgramProgress(files.Length);
    int docFoundCount = 0;
    int totalDocCount = 0;
    foreach (var file in files)
    {
        FileOperations.ReadJsonFile<Spinn3rTwitterData>(file, (data) =>
        {
            tweetCnt += data.count;
            foreach (var tweet in data.items)
            {
                // Only English tweets are considered (and counted).
                if (tweet.lang != "en")
                {
                    continue;
                }
                if (tweet.main.ToLower().Contains(searchStr))
                {
                    var document = new Document();
                    document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    indexWriter.AddDocument(document);
                    docFoundCount++;
                }
                totalDocCount++;
            }
        });

        // Guard against integer division by zero when no English tweet has been
        // seen yet (e.g. the first files contain only non-English items).
        int percent = totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount;
        progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%) -- {3}",
            docFoundCount, totalDocCount, percent, progressEndStr));
    }
    progress.PrintTotalTime();

    Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount,
        totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount);

    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();
}
/// <summary>
/// Deduplicates an index: optionally removes documents sharing a URL, then
/// repeatedly removes near-duplicate content at each configured time/word
/// granularity, finally writing the surviving documents to a fresh index.
/// </summary>
public void Start()
{
    // Start each run with a clean debug log.
    string debugFileName = Configure.OutputPath + _debugFileName;
    if (File.Exists(debugFileName))
    {
        File.Delete(debugFileName);
    }

    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);

    // Begin with every document as a survivor candidate.
    var docIDs = new List<int>();
    int numDocs = reader.NumDocs();
    for (int docId = 0; docId < numDocs; docId++)
    {
        docIDs.Add(docId);
    }

    if (Configure.IsRemoveSameURL)
    {
        Console.WriteLine("=====================RemoveSameURL=====================");
        docIDs = RemoveSameURLDocument(reader, docIDs);
    }

    if (Configure.IsRemoveSimilarContent)
    {
        // Each pass narrows the survivor list at a coarser/finer granularity pair.
        for (int iGranu = 0; iGranu < Configure.RemoveDateGranularity.Length; iGranu++)
        {
            int timeGranu = Configure.RemoveDateGranularity[iGranu];
            int wordGranu = Configure.RemoveWordGranularity[iGranu];
            Console.WriteLine("========Remove Similar Document: {0} out of {1}, Granu: {2} {3}========",
                iGranu, Configure.RemoveDateGranularity.Length, timeGranu, wordGranu);
            docIDs = RemoveSimilarDocumentsGranu(reader, docIDs, timeGranu, wordGranu);
        }
    }

    // Persist the survivors to the output index.
    var writer = LuceneOperations.GetIndexWriter(Configure.OutputPath);
    foreach (var docID in docIDs)
    {
        writer.AddDocument(reader.Document(docID));
    }
    writer.Optimize();
    writer.Close();
    reader.Close();

    Console.WriteLine("All done");
}
/// <summary>
/// Returns the index writer responsible for the given document: the single shared
/// writer when not splitting by time, otherwise a per-date-bucket writer created
/// lazily under OutputPath/{bucket}.
/// </summary>
private IndexWriter GetWriter(Document doc)
{
    if (!Configure.IsSplitByTime)
    {
        return _writers.Values.First();
    }

    // Map the document's time field onto its bucket label (e.g. "2015-01-04").
    var docTime = StringOperations.ParseDateTimeString(doc.Get(Configure.TimeField), Configure.ParseTimeFormat);
    var bucket = _dateTransferFunc(docTime.ToString(_dateFormatString));

    IndexWriter bucketWriter;
    if (!_writers.TryGetValue(bucket, out bucketWriter))
    {
        // First document in this bucket: open a writer rooted at OutputPath/bucket.
        var bucketPath = StringOperations.EnsureFolderEnd(Configure.OutputPath) + bucket;
        bucketWriter = LuceneOperations.GetIndexWriter(bucketPath);
        _writers[bucket] = bucketWriter;
    }
    return bucketWriter;
}
/// <summary>
/// Builds a Lucene index from crawled Weibo pages: every *.txt file under the
/// input folder (except those whose name starts with '_') is parsed, and each
/// contained weibo becomes one indexed document.
/// </summary>
private void BuildFromWeiboWebPages()
{
    var indexWriter = LuceneOperations.GetIndexWriter(WeiboConfigure.OutputPath);

    foreach (var filename in Directory.EnumerateFiles(WeiboConfigure.InputPath, "*.txt", SearchOption.AllDirectories))
    {
        // Files prefixed with '_' are auxiliary and skipped.
        if (Path.GetFileName(filename).StartsWith("_"))
        {
            continue;
        }

        var parser = new WeiboParser(filename);
        foreach (var weibo in parser.GetContainedWeibo())
        {
            // Field-name/value pairs for this weibo; all stored and analyzed alike.
            var fieldValues = new[]
            {
                Tuple.Create(WeiboLuceneFields.UserNickName, weibo.UserNickName),
                Tuple.Create(WeiboLuceneFields.UserID, weibo.UserID),
                Tuple.Create(WeiboLuceneFields.NewsArticleDescription, weibo.Content),
                Tuple.Create(WeiboLuceneFields.DiscoveryStringTime, weibo.Time),
                Tuple.Create(WeiboLuceneFields.Source, weibo.Source),
                Tuple.Create(WeiboLuceneFields.UpCount, weibo.UpCount.ToString()),
                Tuple.Create(WeiboLuceneFields.ForwardCount, weibo.ForwardCount.ToString()),
                Tuple.Create(WeiboLuceneFields.CollectCount, weibo.CollectCount.ToString()),
                Tuple.Create(WeiboLuceneFields.ReplyCount, weibo.ReplyCount.ToString()),
            };

            var doc = new Document();
            foreach (var pair in fieldValues)
            {
                doc.Add(new Field(pair.Item1, pair.Item2, Field.Store.YES, Field.Index.ANALYZED));
            }
            indexWriter.AddDocument(doc);
        }
    }

    indexWriter.Optimize();
    indexWriter.Close();
}
/// <summary>
/// Extracts Bing news articles from XML dumps into one Lucene index per keyword
/// list. Work is sharded across processes: this process handles the files whose
/// position modulo processorNum equals iProcessor. Parsing may run in parallel
/// (up to maxThreadNum threads), with writers/info files shared across threads.
/// </summary>
private void BuildFromBingNewsXMLs()
{
    // Fields copied into the index for each article.
    // NOTE(review): "PublishedDateTime" appears twice in this list — possibly one
    // entry was meant to be a different field; confirm before relying on it.
    string[] selectedFields = new string[] { "DocumentURL", "DocumentUrl", "Country", "NewsArticleCategoryData", "NewsArticleHeadline", "NewsArticleDescription", "DiscoveryStringTime", "PublishedDateTime", "DownloadStringTime", "PublishedDateTime", "NewsSource" }; //NewsArticleBodyNEMap, RealTimeType

    List<string> bingnewspaths = BingNewsConfigure.BingNewsPaths;
    int iProcessor = BingNewsConfigure.iProcessor;
    int processorNum = BingNewsConfigure.ProcessorNum;
    string startdate = BingNewsConfigure.StartDate;
    string enddate = BingNewsConfigure.EndDate;
    List<string[]> keywordLists = BingNewsConfigure.KeywordLists;
    List<string> indexpaths = BingNewsConfigure.IndexPaths;
    List<string> languages = BingNewsConfigure.Languages;
    int maxThreadNum = BingNewsConfigure.MaxThreadNum;
    //LoadExtractBingNewsDataConfig_KeyWordList(out bingnewspaths,
    //    out iProcessor, out processorNum, out startdate, out enddate,
    //    out keywordLists, out languages, out indexpaths);

    // One output directory and one info (.dat) file per keyword list, named
    // "BingNews_{firstKeyword}_{iProcessor}_{processorNum}".
    List<string> outputdirs = new List<string>();
    List<string> infofilenames = new List<string>();
    int ikeyword2 = 0;
    foreach (string indexpath in indexpaths)
    {
        string outputdir = indexpath + "BingNews_" + keywordLists[ikeyword2][0] + "_" + iProcessor + "_" + processorNum;
        if (!Directory.Exists(outputdir))
        {
            Directory.CreateDirectory(outputdir);
        }
        infofilenames.Add(indexpath + "BingNews_" + keywordLists[ikeyword2][0] + "_" + iProcessor + "_" + processorNum + ".dat");
        outputdirs.Add(outputdir);
        ikeyword2++;
    }

    // Open a Lucene writer and a stats stream for each keyword list (parallel lists,
    // indexed by keyword-list position).
    List<IndexWriter> indexwriters = new List<IndexWriter>();
    List<StreamWriter> infofiles = new List<StreamWriter>();
    for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
    {
        IndexWriter indexwriter = LuceneOperations.GetIndexWriter(outputdirs[ikeyword2]);
        StreamWriter infofile = new StreamWriter(infofilenames[ikeyword2]);
        indexwriters.Add(indexwriter);
        infofiles.Add(infofile);
    }

    // Collect every file under every input root, filter to the date window, then
    // keep only this processor's shard (every processorNum-th file from iProcessor).
    List<string> allfilenames = new List<string>();
    foreach (var bingnewpath in bingnewspaths)
    {
        allfilenames.AddRange(Directory.GetFiles(bingnewpath, "*.*", System.IO.SearchOption.AllDirectories));
    }
    allfilenames = FilterDates(allfilenames, startdate, enddate).ToList();
    List<string> filenames = new List<string>();
    for (int i = iProcessor; i < allfilenames.Count; i += processorNum)
    {
        filenames.Add(allfilenames[i]);
    }

    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    ProgramProgress progress = new ProgramProgress(filenames.Count);
    //ProgramProgress progress = new ProgramProgress(filenames.Count / processorNum);
    int[] newsfoundcnts = new int[keywordLists.Count]; // per-keyword-list match counters, filled by BuildLuceneFromFile
    DateTime time_begin_1 = DateTime.Now;
    //for (int ifilename = iProcessor; ifilename < filenames.Count; ifilename += processorNum)
    if (maxThreadNum == 1)
    {
        // Sequential path: process each sharded file in order.
        foreach (var filename in filenames)
        {
            BuildLuceneFromFile(filename, keywordLists, indexwriters, languages, selectedFields, newsfoundcnts, infofiles, progress);
        }
    }
    else
    {
        // Parallel path, capped at maxThreadNum worker threads.
        // NOTE(review): 'obj' is created but never used (perhaps a leftover lock
        // object); BuildLuceneFromFile is presumed to synchronize the shared
        // writers/counters itself — confirm before raising maxThreadNum.
        ParallelOptions options = new ParallelOptions();
        options.MaxDegreeOfParallelism = maxThreadNum;
        object obj = new Object();
        Parallel.ForEach(filenames, options, filename => BuildLuceneFromFile(filename, keywordLists, indexwriters, languages, selectedFields, newsfoundcnts, infofiles, progress));
    }

    // Record the XML-extraction elapsed time in every info file.
    for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
    {
        infofiles[ikeyword2].WriteLine("Extract xml time\t" + stopwatch.Elapsed);
    }

    Console.WriteLine("Start writing to lucene index...");
    Stopwatch stopwatch2 = new Stopwatch();
    stopwatch2.Start();
    // Finalize all indexes (Optimize + Close flushes everything to disk).
    for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
    {
        indexwriters[ikeyword2].Optimize();
        indexwriters[ikeyword2].Close();
    }
    // Record write/total timings and close the info files.
    for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
    {
        infofiles[ikeyword2].WriteLine("Write to lucene index time\t" + stopwatch2.Elapsed);
        infofiles[ikeyword2].WriteLine("Total time\t" + stopwatch.Elapsed);
        infofiles[ikeyword2].Flush();
        infofiles[ikeyword2].Close();
    }
}
/// <summary>
/// Scans tab-separated tweet dumps under <paramref name="inputFolder"/> and indexes
/// every tweet containing at least one of <paramref name="keywords"/> (tokenized
/// match), deduplicating on (CreatedAt, UserId, Text). Any existing index at
/// <paramref name="indexPath"/> is deleted first. Files whose path contains
/// "Temp-DoNotParse" or that are not *.txt are skipped.
/// </summary>
/// <param name="inputFolder">Root folder of the tweet dump files.</param>
/// <param name="indexPath">Destination Lucene index path (recreated).</param>
/// <param name="keywords">Tokens to search for; a tweet matches if any token of its text is in this set.</param>
public void Transform(string inputFolder, string indexPath, HashSet<string> keywords)
{
    Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
    Console.WriteLine("InputFolder: " + inputFolder + "\n");
    string notParseSpecString = "Temp-DoNotParse";
    inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

    // Column layout of the dump files; the inverted dictionary maps name -> index.
    string[] schema = new[] { "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount", "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount" };
    var schemeDict = Util.GetInvertedDictionary(schema);
    var textFieldIndex = schemeDict["Text"];
    var createdTimeFieldIndex = schemeDict["CreatedAt"];
    var userIdFieldIndex = schemeDict["UserId"];

    // Rebuild the index from scratch.
    if (Directory.Exists(indexPath))
    {
        Directory.Delete(indexPath, true);
    }

    var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

    // Preprocess: estimate total tweet count (one per line) for progress reporting.
    Console.WriteLine("Start preprocesing...");
    ProgramProgress progress = new ProgramProgress(files.Length);
    int estiDocCnt = 0;
    foreach (var file in files)
    {
        estiDocCnt += FileOperations.GetLineCount(file);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

    // Parse and index.
    Console.WriteLine("Start parsing...");
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
    progress = new ProgramProgress(estiDocCnt);
    var sep = new char[] { '\t' };
    int uniqDocFoundCnt = 0;
    int docFoundCnt = 0;
    int docCnt = 0;
    // Dedup key: (CreatedAt, UserId, Text).
    ThreeLayerHashSet<string, long, string> hash3Layer = new ThreeLayerHashSet<string, long, string>();
    int notUsedDocCnt = 0;
    foreach (var file in files)
    {
        if (file.Contains(notParseSpecString))
        {
            continue;
        }
        if (file.EndsWith(".txt"))
        {
            var sr = new StreamReader(file);
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                var tokens = line.Split(sep, StringSplitOptions.None);
                if (tokens.Length != schema.Length)
                {
                    // Malformed line: count and skip rather than abort the whole run.
                    notUsedDocCnt++;
                    continue;
                }

                var words = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                bool isContainSearch = false;
                foreach (var word in words)
                {
                    if (keywords.Contains(word))
                    {
                        isContainSearch = true;
                        break;
                    }
                }

                if (isContainSearch)
                {
                    string createdAt = tokens[createdTimeFieldIndex];
                    long userId = long.Parse(tokens[userIdFieldIndex]);
                    string text = tokens[textFieldIndex];
                    if (!hash3Layer.Contains(createdAt, userId, text))
                    {
                        var document = new Document();
                        for (int i = 0; i < schema.Length; i++)
                        {
                            document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                        }
                        indexWriter.AddDocument(document);
                        hash3Layer.Add(createdAt, userId, text);
                        uniqDocFoundCnt++;
                    }
                    docFoundCnt++;
                }
                docCnt++;
                progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%",
                    uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt,
                    (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
            }
            sr.Close();
        }
    }
    progress.PrintTotalTime();

    // Guard both divisions: docCnt is 0 when no parsable line was seen, and
    // docFoundCnt is 0 when no tweet matched — either previously threw
    // DivideByZeroException in this summary line.
    Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%",
        uniqDocFoundCnt, docCnt,
        docCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docCnt,
        docFoundCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docFoundCnt));
    Console.WriteLine("Not used doc count: " + notUsedDocCnt);

    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();

    Console.WriteLine("Finish");
    Console.ReadKey();
}
/// <summary>
/// Iterative keyword-expansion search over an index: starting from the seed
/// 'keywords', repeatedly (up to 5 iterations) searches, extracts the most
/// frequent tokens of the hits as the next keyword set, and stops early once the
/// overlap ratio between consecutive hit sets reaches 'threshold' (a threshold of
/// 0 disables iteration entirely). The final hit set (sized by saveDocRatio) is
/// written to a new index; optionally the non-selected documents are dumped to
/// "removeDocuments.txt" for inspection.
/// </summary>
public void Start()
{
    if (!outputpath.EndsWith("\\"))
    {
        outputpath += "\\";
    }
    var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);
    var searcher = LuceneOperations.GetIndexSearcher(inputpath);
    // Search budget during iteration is a fraction of the whole index.
    var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
    var scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);
    int iter = 0;
    // threshold == 0 means "no refinement": skip the loop and save the seed results.
    bool bContinue = threshold == 0 ? false : true;
    while (bContinue && iter < 5) // hard cap of 5 refinement iterations
    {
        iter++;
        Console.WriteLine("iteration------------------" + iter);
        List<string> keywordsNew;
        #region Calculate Keywords
        // Count token frequencies over the current hit set and keep the top
        // keywordNum tokens as the next query.
        var counter = new Counter<string>();
        foreach (var scoredDoc in scoredDocs)
        {
            var doc = searcher.Doc(scoredDoc.doc);
            var content = doc.Get(searchfield);
            foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
            {
                counter.Add(word);
            }
        }
        keywordsNew = counter.GetMostFreqObjs(keywordNum);
        #endregion
        var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);
        #region Test whether exit
        // Overlap between old and new hit sets; iteration continues while the
        // repeat ratio is still below the convergence threshold.
        int repeatNum = 0;
        var docIDs = new HashSet<int>();
        foreach (var scoredDoc in scoredDocs)
        {
            docIDs.Add(scoredDoc.doc);
        }
        foreach (var scoredDocNew in scoredDocsNew)
        {
            if (docIDs.Contains(scoredDocNew.doc))
            {
                repeatNum++;
            }
        }
        bContinue = (double)repeatNum / scoredDocs.Length < threshold;
        #endregion
        Console.WriteLine(repeatNum + " " + scoredDocsNew.Length);
        keywords = keywordsNew;
        scoredDocs = scoredDocsNew;
        Console.WriteLine(StringOperations.GetMergedString(keywords));
    }
    // Final search with the save budget (may differ from the search budget).
    max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
    scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);
    var writer = LuceneOperations.GetIndexWriter(outputpath);
    foreach (var scoredDoc in scoredDocs)
    {
        Document doc = searcher.Doc(scoredDoc.doc);
        writer.AddDocument(doc);
    }
    writer.Optimize();
    writer.Close();
    if (isPrintRemovedDocuments)
    {
        // Dump every document that did NOT make it into the output index.
        var sw = new StreamWriter(outputpath + "removeDocuments.txt");
        var selectedDocIDs = new HashSet<int>();
        foreach (var scoredDoc in scoredDocs)
        {
            selectedDocIDs.Add(scoredDoc.doc);
        }
        var reader = searcher.GetIndexReader();
        for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
        {
            if (!selectedDocIDs.Contains(iDoc))
            {
                sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
            }
        }
        reader.Close();
        sw.Flush();
        sw.Close();
    }
    searcher.Close();
    Console.WriteLine("Done");
    Console.ReadKey();
}
/// <summary>
/// Scans Spinn3r JSON files and indexes every English tweet that matches the
/// search set: by tokenized body words (SearchSpinn3rType.Main) or by author link
/// (SearchSpinn3rType.User). One Lucene document is written per matching tweet.
/// </summary>
/// <param name="files">Spinn3r JSON files to scan.</param>
/// <param name="indexPath">Destination Lucene index path.</param>
/// <param name="searchHashSet">Lower-cased words (Main) or author links (User) to match.</param>
/// <param name="searchType">Which tweet attribute to match against.</param>
public void TransformWithFileNames(string[] files, string indexPath, HashSet<string> searchHashSet, SearchSpinn3rType searchType)
{
    double tweetCnt = 0;
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);

    var progress = new ProgramProgress(files.Length);
    int docFoundCount = 0;
    int totalDocCount = 0;
    foreach (var file in files)
    {
        FileOperations.ReadJsonFile<Spinn3rTwitterData>(file, (data) =>
        {
            tweetCnt += data.count;
            foreach (var tweet in data.items)
            {
                // Only English tweets are considered (and counted).
                if (tweet.lang != "en")
                {
                    continue;
                }

                bool isContainSearch = false;
                switch (searchType)
                {
                    case SearchSpinn3rType.Main:
                        // Match if any token of the tweet body is in the search set.
                        var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig);
                        foreach (var word in words)
                        {
                            if (searchHashSet.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        break;
                    case SearchSpinn3rType.User:
                        isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower());
                        break;
                    default:
                        throw new ArgumentException();
                }

                if (isContainSearch)
                {
                    var document = new Document();
                    document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    indexWriter.AddDocument(document);
                    docFoundCount++;
                }
                totalDocCount++;
            }
        });

        // Guard against integer division by zero when no English tweet has been
        // seen yet (previously threw DivideByZeroException on such files).
        int percent = totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount;
        progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)",
            docFoundCount, totalDocCount, percent));
    }
    progress.PrintTotalTime();

    Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount,
        totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount);

    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();

    Util.ProgramFinishHalt();
}
/// <summary>
/// Converts a deduplicated Ebola tweet index into the BingNews document schema:
/// maps the creation time, derives a headline from the first hashtag (or first
/// word), builds a frequency-based feature vector from hashtags and words, and
/// assigns sequential DocIds with synthetic URLs. Paths are hard-coded.
/// </summary>
public void Start()
{
    string inputPath = @"D:\DataProcess\TweetIndex\tweets-Ebola-20150101-20150228_dedup\";
    string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter2\";

    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

    char[] seperator = new char[] { ' ' };
    // Extra source fields copied through unchanged (empty string when missing).
    string[] aidFields = new string[] { "User_FollowersCount", "User_Name", "User_ScreenName", "Retweet", "Mention" };

    ProgramProgress progress = new ProgramProgress(indexReader.NumDocs());
    //for (int iDoc = 0; iDoc < 1000; iDoc++)
    for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
    {
        Document inDoc = indexReader.Document(iDoc);
        Document outDoc = new Document();

        // NOTE(review): DateTime.Parse uses the current culture; the "CreateAt"
        // format is assumed parseable on this machine — confirm for other locales.
        string inTime = inDoc.Get("CreateAt");
        DateTime dateTime = DateTime.Parse(inTime);
        outDoc.Add(new Field(BingNewsFields.DiscoveryStringTime, dateTime.ToString(BingNewsFields.TimeFormat), Field.Store.YES, Field.Index.ANALYZED));

        string hashtag = inDoc.Get("Hashtag");
        string word = inDoc.Get("Word");
        // NOTE(review): only 'hashtag' is null-guarded; a missing "Word" field
        // would throw NullReferenceException on Split below — verify the input
        // index always stores "Word".
        if (hashtag == null)
        {
            hashtag = "";
        }
        var hashtagTokens = hashtag.Split(seperator, StringSplitOptions.RemoveEmptyEntries);
        var wordTokens = word.Split(seperator, StringSplitOptions.RemoveEmptyEntries);
        // Headline preference: first hashtag, else first word, else empty.
        string title = hashtagTokens.Length > 0 ? hashtagTokens[0] : wordTokens.Length > 0 ? wordTokens[0] : "";
        outDoc.Add(new Field(BingNewsFields.NewsArticleHeadline, title, Field.Store.YES, Field.Index.ANALYZED));
        outDoc.Add(new Field(BingNewsFields.NewsArticleDescription, inDoc.Get("Text"), Field.Store.YES, Field.Index.ANALYZED));

        // Build the feature vector as "token(count)" entries.
        string featureVector = "";
        Counter<string> counter = new Counter<string>();
        foreach (var tag in hashtagTokens)
        {
            // Hashtags are added twice — presumably intentional double weighting
            // relative to plain words; confirm before changing.
            counter.Add(tag);
            counter.Add(tag);
        }
        foreach (var w in wordTokens)
        {
            counter.Add(w);
        }
        foreach (var kvp in counter.GetSortedCountDictioanry())
        {
            // "\\n" emits a literal backslash-n separator (not a newline) —
            // presumably the feature-vector format convention; verify downstream.
            featureVector += string.Format("{0}({1})\\n", kvp.Key, kvp.Value);
        }
        outDoc.Add(new Field(BingNewsFields.FeatureVector, featureVector, Field.Store.YES, Field.Index.ANALYZED));

        // Sequential id plus a synthetic unique URL derived from it.
        outDoc.Add(new Field(BingNewsFields.DocId, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));
        outDoc.Add(new Field(BingNewsFields.DocumentURL, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

        foreach (var aidField in aidFields)
        {
            var value = inDoc.Get(aidField);
            outDoc.Add(new Field(aidField, value == null ? "" : value, Field.Store.YES, Field.Index.ANALYZED));
        }

        indexWriter.AddDocument(outDoc);
        progress.PrintIncrementExperiment();
    }

    indexWriter.Optimize();
    indexWriter.Close();
    indexReader.Close();
}