public void StartTransformTweetIndexForStreamingRoseRiver()
{
    string inputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01\";
    string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01_MOD\";
    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

    string docIDField = BingNewsFields.DocId;
    string urlField = BingNewsFields.DocumentURL;

    ProgramProgress progress = new ProgramProgress(indexReader.NumDocs());
    for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
    {
        // Reuse the stored document, but overwrite its DocId and URL fields
        // with values derived from the document number.
        Document inDoc = indexReader.Document(iDoc);
        Document outDoc = inDoc;
        outDoc.RemoveField(docIDField);
        outDoc.Add(new Field(docIDField, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));
        outDoc.RemoveField(urlField);
        outDoc.Add(new Field(urlField, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.AddDocument(outDoc);
        progress.PrintIncrementExperiment();
    }

    indexWriter.Optimize();
    indexWriter.Close();
    indexReader.Close();
}
private void InitializeWriters() { if (Configure.IsSplitByTime) { _dateTransferFunc = str => { var dateTime = StringOperations.ParseDateTimeString(str, _dateFormatString); if (Configure.SplitDayCount == 7) { dateTime = dateTime.Subtract(TimeSpan.FromDays((int)dateTime.DayOfWeek)); } else { var days = dateTime.Subtract(_minDateTime).TotalDays; var residueDays = days % Configure.SplitDayCount; dateTime = dateTime.Subtract(TimeSpan.FromDays(residueDays)); } return(dateTime.ToString("yyyy-MM-dd")); }; } else { IndexWriter writer = LuceneOperations.GetIndexWriter(Configure.OutputPath); _writers.Add("", writer); } }
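// A minimal, self-contained sketch of the date-bucketing logic assigned to _dateTransferFunc above
// (assumed semantics, not project code): SplitDayCount == 7 snaps a date to the start of its week,
// any other value snaps it to the start of its fixed-size bin counted from a minimum date.
static string GetBucketKey(DateTime dateTime, DateTime minDateTime, int splitDayCount)
{
    if (splitDayCount == 7)
    {
        // Align to the most recent Sunday (DayOfWeek.Sunday == 0).
        dateTime = dateTime.Subtract(TimeSpan.FromDays((int)dateTime.DayOfWeek));
    }
    else
    {
        // Align to the start of the current splitDayCount-day bin.
        var days = dateTime.Subtract(minDateTime).TotalDays;
        var residueDays = days % splitDayCount;
        dateTime = dateTime.Subtract(TimeSpan.FromDays(residueDays));
    }
    return dateTime.ToString("yyyy-MM-dd");
}
// Example: GetBucketKey(new DateTime(2015, 1, 7), new DateTime(2015, 1, 1), 7) returns "2015-01-04",
// i.e. the Sunday that starts the week containing January 7, 2015.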
/// <summary>
/// Keep only the tweets whose creation time falls within a given time range
/// Output: *.filter.txt
/// </summary>
/// <param name="lucenePath">Lucene index folder path of tweets</param>
/// <param name="fileName">Input file path and prefix of output file</param>
/// <param name="minTimeStr">Lower bound of time range</param>
/// <param name="maxTimeStr">Upper bound of time range</param>
public static void filterTimeRange(string lucenePath, string fileName, string minTimeStr, string maxTimeStr)
{
    var indexReader = LuceneOperations.GetIndexReader(lucenePath);
    StreamReader sr = new StreamReader(fileName, Encoding.Default);
    FileStream fs = new FileStream(fileName + ".filter.txt", FileMode.Create);
    StreamWriter sw = new StreamWriter(fs, Encoding.Default);

    // Parse the range bounds once, outside the loop.
    DateTime minTime = DateTime.Parse(minTimeStr);
    DateTime maxTime = DateTime.Parse(maxTimeStr);

    string line;
    while ((line = sr.ReadLine()) != null)
    {
        int iDoc = int.Parse(line);
        Document inDoc = indexReader.Document(iDoc);
        string timeStr = inDoc.Get("CreatedAt");
        DateTime time = DateTime.Parse(timeStr);
        if (DateTime.Compare(time, minTime) > 0 && DateTime.Compare(time, maxTime) < 0)
        {
            sw.WriteLine(iDoc);
        }
    }
    sw.Close();
    fs.Close();
    sr.Close();
}
public static void AnalyzeFieldValues(string inputPath, string fieldName, Func <string, string> convertValueFunc = null) { if (convertValueFunc == null) { convertValueFunc = str => str; } string fileName = StringOperations.EnsureFolderEnd(inputPath) + fieldName + ".txt"; StreamWriter sw = new StreamWriter(fileName); Counter <string> counter = new Counter <string>(); var indexReader = LuceneOperations.GetIndexReader(inputPath); for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++) { var doc = indexReader.Document(iDoc); var value = doc.Get(fieldName); counter.Add(convertValueFunc(value)); } foreach (var kvp in counter.GetCountDictionary().OrderBy(kvp => kvp.Key)) { sw.WriteLine(kvp.Key + "\t\t" + kvp.Value); Console.WriteLine(kvp.Key + "\t\t" + kvp.Value); } sw.WriteLine("total: " + indexReader.NumDocs()); sw.Flush(); sw.Close(); indexReader.Close(); Console.ReadKey(); }
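// A hypothetical usage sketch for AnalyzeFieldValues above; the index path and field name are
// placeholders, and the converter collapses full timestamps to dates so the histogram is per day.
public static void AnalyzeCreatedAtByDay()
{
    AnalyzeFieldValues(@"D:\DataProcess\Index\SomeIndex\",   // placeholder index folder
        "CreatedAt",                                         // field to histogram
        str => str == null ? "NULL" : DateTime.Parse(str).ToString("yyyy-MM-dd"));
}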
public void Start() { var writer = LuceneOperations.GetIndexWriter(OutputPath); var totalDocCnt = 0; foreach (var inputPath in InputPaths) { var reader = LuceneOperations.GetIndexReader(inputPath); totalDocCnt += reader.NumDocs(); reader.Close(); } var progress = new ProgramProgress(totalDocCnt); foreach (var inputPath in InputPaths) { var reader = LuceneOperations.GetIndexReader(inputPath); for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++) { writer.AddDocument(reader.Document(iDoc)); progress.PrintIncrementExperiment(); } reader.Close(); } writer.Optimize(); writer.Close(); }
public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var sw = new StreamWriter(Configure.OutputPath);
    IndexWriter writer = null;
    if (Configure.IsFilterByWordCount)
    {
        writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
    }
    if (Configure.IsLoadFromFeatureVector)
    {
        Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
    }
    Console.WriteLine("Total: " + reader.NumDocs());

    int docIndex = 0;
    for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
    {
        if (iDoc % 10000 == 0)
        {
            Console.WriteLine(iDoc);
            sw.Flush();
        }
        string content = Configure.IsLoadFromFeatureVector
            ? reader.Document(iDoc).Get(BingNewsFields.FeatureVector)
            : LuceneOperations.GetDocumentContent(reader.Document(iDoc), Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
        List<string> words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
        bool isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
        if (isPrintDoc)
        {
            if (Configure.IsFilterByWordCount)
            {
                writer.AddDocument(reader.Document(iDoc));
            }
            sw.Write(docIndex + " " + docIndex + " ");
            foreach (var word in words)
            {
                sw.Write(word + " ");
            }
            sw.Write("\n");
            docIndex++;
        }
    }
    if (Configure.IsFilterByWordCount)
    {
        writer.Optimize();
        writer.Close();
    }
    sw.Flush();
    sw.Close();
    reader.Close();
}
private SparseVectorList GetFeatureVector(Document doc, Dictionary<string, int> lexicon)
{
    SparseVectorList featurevector = new SparseVectorList();
    int lexiconindexcount = lexicon.Count;
    var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
    var words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
    foreach (var word in words)
    {
        int value;
        // Assign the next free lexicon index to words seen for the first time.
        if (!lexicon.TryGetValue(word, out value))
        {
            lexicon.Add(word, lexiconindexcount);
            value = lexiconindexcount;
            lexiconindexcount++;
        }
        if (!featurevector.Increase(value, 1))
        {
            featurevector.Insert(value, 1);
        }
    }
    featurevector.ListToArray();
    featurevector.count = featurevector.keyarray.Length;
    //featurevector.SumUpValueArray();
    if (featurevector.count < 1)
    {
        return null;
    }
    featurevector.InvalidateList();
    featurevector.GetNorm();
    return featurevector;
}
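// A minimal sketch of the lexicon + SparseVectorList bookkeeping performed above, using only
// standard collections (assumed equivalent behavior, not the project's SparseVectorList type):
// each distinct word gets a stable integer id, and the vector maps word id -> term frequency.
static Dictionary<int, int> GetTermFrequencyVector(IEnumerable<string> words, Dictionary<string, int> lexicon)
{
    var vector = new Dictionary<int, int>();
    foreach (var word in words)
    {
        int id;
        if (!lexicon.TryGetValue(word, out id))
        {
            id = lexicon.Count;      // assign the next free index
            lexicon.Add(word, id);
        }
        int freq;
        vector[id] = vector.TryGetValue(id, out freq) ? freq + 1 : 1;
    }
    return vector.Count > 0 ? vector : null;   // mirror the "empty vector -> null" convention above
}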
/// <summary> /// Calculate the average published time of each tweet cluster /// Output: clusterAverageTime.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> public static void averageTime(string fileName) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default); StreamReader sr1 = new StreamReader("generalCluster.txt", Encoding.Default); FileStream fs = new FileStream("clusterAverageTime.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); string line; string line1; while ((line = sr.ReadLine()) != null && (line1 = sr1.ReadLine()) != null) { line = sr.ReadLine(); line1 = sr1.ReadLine(); sr.ReadLine(); sr1.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } string[] iDocStrArray1 = Regex.Split(line1, " "); List <int> iDocList1 = new List <int>(); for (int i = 0; i < iDocStrArray1.Length - 1; i++) { iDocList1.Add(int.Parse(iDocStrArray1[i])); } int count = iDocList.Count + iDocList1.Count; double temp = 0.0; for (int i = 0; i < iDocList.Count; i++) { Document inDoc = indexReader.Document(iDocList[i]); string timeStr = inDoc.Get("CreatedAt"); DateTime time = DateTime.Parse(timeStr); temp += (double)time.Ticks / count; } for (int i = 0; i < iDocList1.Count; i++) { Document inDoc = indexReader.Document(iDocList1[i]); string timeStr = inDoc.Get("CreatedAt"); DateTime time = DateTime.Parse(timeStr); temp += (double)time.Ticks / count; } DateTime timeAvg = new DateTime((long)temp); sw.WriteLine(timeAvg.ToString()); } sw.Close(); fs.Close(); sr1.Close(); sr.Close(); }
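// A small self-contained illustration of the tick-averaging idea used in averageTime above:
// each DateTime contributes time.Ticks / count so the running sum stays within double range,
// and the sum is converted back to a DateTime at the end.
static DateTime AverageTime(List<DateTime> times)
{
    double averageTicks = 0.0;
    foreach (var time in times)
    {
        averageTicks += (double)time.Ticks / times.Count;
    }
    return new DateTime((long)averageTicks);
}
// Example: averaging 2015-01-01 00:00 and 2015-01-03 00:00 yields 2015-01-02 00:00.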
/// <summary>
/// Twitter data from Cosmos: each line represents a tweet.
/// Fields are separated by '\t'; the schema gives the name of each field.
/// </summary>
private void BuildFromTwitterTxt()
{
    string inputpath = TwitterConfigure.InputPath;
    string outputpath = TwitterConfigure.OutputPath;
    var schema = TwitterConfigure.TwitterSchema;
    string bodyField = TwitterConfigure.TwitterBodyField;
    var indexwriter = LuceneOperations.GetIndexWriter(outputpath);

    // First pass: count the lines so progress can be reported.
    StreamReader sr = new StreamReader(inputpath);
    string line;
    int lineCnt = 0;
    while ((line = sr.ReadLine()) != null)
    {
        lineCnt++;
    }
    //Console.WriteLine("Total Lines: " + lineCnt);
    sr.Close();

    // Second pass: build one Lucene document per tweet.
    sr = new StreamReader(inputpath);
    var separator = new char[] { '\t' };
    int lineIndex = 0;
    var progress = new ProgramProgress(lineCnt);
    while ((line = sr.ReadLine()) != null)
    {
        //if (lineIndex % 100000 == 0)
        //    Console.WriteLine("{0} out of {1} ({2}%)", lineIndex, lineCnt, 100 * lineIndex / lineCnt);
        var tokens = line.Split(separator);//, StringSplitOptions.RemoveEmptyEntries);
        if (tokens.Length != schema.Length)
        {
            throw new Exception("Schema mismatch");
        }
        var document = new Document();
        for (int i = 0; i < tokens.Length; i++)
        {
            if (schema[i] == bodyField)
            {
                tokens[i] = RemoveContentNoise.RemoveTweetIndexNoise(tokens[i]);
            }
            document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
        }
        indexwriter.AddDocument(document);
        lineIndex++;
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    sr.Close();
    indexwriter.Optimize();
    indexwriter.Close();
}
public void TransformWithFileNameContentSearch(string[] files, string indexPath, string searchStr, string progressEndStr = null) { double tweetCnt = 0; var indexWriter = LuceneOperations.GetIndexWriter(indexPath); searchStr = searchStr.ToLower(); var progress = new ProgramProgress(files.Length); int docFoundCount = 0; int totalDocCount = 0; foreach (var file in files) { FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) => { tweetCnt += data.count; //Console.WriteLine(data.count); //Console.WriteLine(data.items[0].main); foreach (var tweet in data.items) { if (tweet.lang != "en") { continue; } if (tweet.main.ToLower().Contains(searchStr)) { var document = new Document(); document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED)); indexWriter.AddDocument(document); docFoundCount++; } totalDocCount++; } }); progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%) -- {3}", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount, progressEndStr)); } progress.PrintTotalTime(); Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount); Console.WriteLine("Start writing index..."); indexWriter.Commit(); indexWriter.Close(); //Util.ProgramFinishHalt(); }
/// <summary>
/// Extract the unigrams, bigrams and trigrams of signal tweets.
/// Requires running MatchSignal.match_ori() first.
/// Preparation step for the signal-tweet clustering method cluster_ori().
/// </summary>
/// <param name="fileName">Lucene index folder path of tweets</param>
/// <param name="gramsList">List of unigrams, bigrams and trigrams of signal tweets</param>
/// <param name="rec2iDoc">Dictionary from 3-grams record list # to tweet ID #</param>
/// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list #</param>
public static void preCluster_ori(string fileName, List<List<HashSet<string>>> gramsList, Dictionary<int, int> rec2iDoc, Dictionary<int, int> iDoc2rec)
{
    var indexReader = LuceneOperations.GetIndexReader(fileName);
    StreamReader sr = new StreamReader("signal.txt", Encoding.Default);
    string line;
    int recNum = 0;
    while ((line = sr.ReadLine()) != null)
    {
        int iDoc = int.Parse(line);
        Document inDoc = indexReader.Document(iDoc);
        string text = inDoc.Get("Text").ToLower();
        text = Regex.Replace(text, @"\s+", " ");
        text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", "");
        string[] gramArray = Regex.Split(text, " ");

        var grams = new List<HashSet<string>>();
        HashSet<string> unigram = new HashSet<string>();
        for (int i = 0; i < gramArray.Length; i++)
        {
            unigram.Add(gramArray[i]);
        }
        grams.Add(unigram);
        HashSet<string> bigram = new HashSet<string>();
        for (int i = 0; i < gramArray.Length - 1; i++)
        {
            bigram.Add(gramArray[i] + " " + gramArray[i + 1]);
        }
        grams.Add(bigram);
        HashSet<string> trigram = new HashSet<string>();
        for (int i = 0; i < gramArray.Length - 2; i++)
        {
            trigram.Add(gramArray[i] + " " + gramArray[i + 1] + " " + gramArray[i + 2]);
        }
        grams.Add(trigram);

        if (recNum % 1000 == 0)
        {
            Console.WriteLine(recNum);
        }
        gramsList.Add(grams);
        rec2iDoc.Add(recNum, iDoc);
        iDoc2rec.Add(iDoc, recNum);
        recNum++;
    }
    sr.Close();
}
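// A minimal sketch of the unigram/bigram/trigram extraction performed above, for one normalized
// tweet (same regex normalization, standard collections only; assumes System.Text.RegularExpressions).
static List<HashSet<string>> GetGramSets(string text)
{
    text = Regex.Replace(text.ToLower(), @"\s+", " ");
    text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", "");
    string[] tokens = text.Split(' ');
    var grams = new List<HashSet<string>>();
    for (int n = 1; n <= 3; n++)
    {
        var set = new HashSet<string>();
        for (int i = 0; i + n <= tokens.Length; i++)
        {
            set.Add(string.Join(" ", tokens, i, n));
        }
        grams.Add(set);
    }
    return grams;
}
// Example: "Is this true?" yields {"is","this","true"}, {"is this","this true"} and {"is this true"}.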
public static void VisualizeTree(IEnumerable <string> brtFiles, string luceneIndex = null, string[] keywords = null, bool isRemoveLeafNodes = true) { List <ITree> trees = new List <ITree>(); foreach (var brtFile in brtFiles) { //Read tree from file TreeDataParser parser = new TreeDataParser(brtFile, isRemoveLeafNodes); var tree = parser.GetTree(); Trace.WriteLine(tree.GetDepth(tree.Root)); if (luceneIndex != null) { var scheme = TreeNodeScheme.Get(tree.Graph.NodeTable); scheme.SetIndexReader(LuceneOperations.GetIndexReader(luceneIndex)); scheme.SetBRTFileName(brtFile); } trees.Add(tree); } //Print analyze info DoubleStatistics depthStat = new DoubleStatistics(); DoubleStatistics internalNodeStat = new DoubleStatistics(); foreach (var tree in trees) { depthStat.AddNumber(tree.BFS(tree.Root).Max(node => { int depth = 0; INode ancestor = node; while ((ancestor = tree.GetParent(ancestor)) != null) { depth++; } return(depth); }) + 1); internalNodeStat.AddNumber(tree.BFS(tree.Root).Count()); } Console.WriteLine(depthStat.ToString()); Console.WriteLine(internalNodeStat.ToString()); //Visualize tree Thread NetServer = new Thread(new ThreadStart(() => { TreeVisualization treeVis = new TreeVisualization(trees, keywords); })); NetServer.SetApartmentState(ApartmentState.STA); NetServer.IsBackground = true; NetServer.Start(); System.Windows.Threading.Dispatcher.Run(); }
public void Start() { string debugFileName = Configure.OutputPath + _debugFileName; if (File.Exists(debugFileName)) { File.Delete(debugFileName); } var reader = LuceneOperations.GetIndexReader(Configure.InputPath); List <int> docIDs = new List <int>(); for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++) { docIDs.Add(iDoc); } if (Configure.IsRemoveSameURL) { Console.WriteLine("=====================RemoveSameURL====================="); docIDs = RemoveSameURLDocument(reader, docIDs); } if (Configure.IsRemoveSimilarContent) { for (int iGranu = 0; iGranu < Configure.RemoveDateGranularity.Length; iGranu++) { int timeGranu = Configure.RemoveDateGranularity[iGranu]; int wordGranu = Configure.RemoveWordGranularity[iGranu]; Console.WriteLine("========Remove Similar Document: {0} out of {1}, Granu: {2} {3}========", iGranu, Configure.RemoveDateGranularity.Length, timeGranu, wordGranu); docIDs = RemoveSimilarDocumentsGranu(reader, docIDs, timeGranu, wordGranu); } } var writer = LuceneOperations.GetIndexWriter(Configure.OutputPath); foreach (var docID in docIDs) { writer.AddDocument(reader.Document(docID)); } writer.Optimize(); writer.Close(); reader.Close(); Console.WriteLine("All done"); //Console.ReadKey(); }
/// <summary> /// Match rumor patterns to find signal tweets /// Preparing step for method ClusterSignal.preCluster_ori() /// Output: signal.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> public static void match_ori(string fileName) { var indexReader = LuceneOperations.GetIndexReader(fileName); FileStream fs = new FileStream("signal.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++) { Document inDoc = indexReader.Document(iDoc); string text = inDoc.Get("Text").ToLower(); if (Regex.IsMatch(text, @"is (this|that|it) true")) { sw.WriteLine(iDoc); continue; } if (Regex.IsMatch(text, @"(^|[^A-Za-z] )wh(a*)t([\?!]+)")) { sw.WriteLine(iDoc); continue; } if (Regex.IsMatch(text, @"(real\?|really\?|unconfirmed)")) { sw.WriteLine(iDoc); continue; } if (Regex.IsMatch(text, @"(rumor|debunk)")) { sw.WriteLine(iDoc); continue; } if (Regex.IsMatch(text, @"(that|this|it) is not true")) { sw.WriteLine(iDoc); continue; } if (iDoc % 100000 == 0) { Console.WriteLine(iDoc); } } sw.Close(); fs.Close(); }
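// A quick self-contained check of the signal patterns used in match_ori above against sample tweets
// (assumed examples, not project data); each pattern flags verification or denial language.
static bool IsSignalTweet(string text)
{
    text = text.ToLower();
    return Regex.IsMatch(text, @"is (this|that|it) true")
        || Regex.IsMatch(text, @"(^|[^A-Za-z] )wh(a*)t([\?!]+)")
        || Regex.IsMatch(text, @"(real\?|really\?|unconfirmed)")
        || Regex.IsMatch(text, @"(rumor|debunk)")
        || Regex.IsMatch(text, @"(that|this|it) is not true");
}
// IsSignalTweet("Is this true? Ebola in NYC") -> true (first pattern)
// IsSignalTweet("Officials debunk the story") -> true (rumor/debunk pattern)
// IsSignalTweet("Stay safe everyone")         -> false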
public void AnalyzeDocuments() { string fileName = @"D:\Project\TopicPanorama\data\TopicGraphs\NewCode-Ebola-Test2\Raw\news\result\lda.top.json"; string indexPath = @"D:\DataProcess\Index\Raw_EbolaEnBingNews_Ebola_0_1_RS_R-1"; int topDocCnt = 20; var indexReader = LuceneOperations.GetIndexReader(indexPath); //Read from json and sort SimpleJsonReader reader = new SimpleJsonReader(new StreamReader(File.Open(fileName, FileMode.Open))); HeapSortDouble[] hsd = null; int topicNumber = -1; ProgramProgress progress = new ProgramProgress(indexReader.NumDocs()); while (reader.IsReadable) { int docID = int.Parse(reader.ReadPropertyName()); double[] topicArray = reader.ReadDoubleArray(); if (topicNumber < 0) { topicNumber = topicArray.Length; hsd = new HeapSortDouble[topicNumber]; for (int i = 0; i < topicNumber; i++) { hsd[i] = new HeapSortDouble(topDocCnt); } } for (int i = 0; i < topicNumber; i++) { hsd[i].Insert(docID, topicArray[i]); } progress.PrintIncrementExperiment(); } progress.PrintTotalTime(); //Statistics Console.ReadLine(); }
private IndexWriter GetWriter(Document doc) { if (!Configure.IsSplitByTime) { return(_writers.Values.First()); } else { var dateTime = StringOperations.ParseDateTimeString(doc.Get(Configure.TimeField), Configure.ParseTimeFormat); string projDate = _dateTransferFunc(dateTime.ToString(_dateFormatString)); IndexWriter writer; if (!_writers.TryGetValue(projDate, out writer)) { string path = StringOperations.EnsureFolderEnd(Configure.OutputPath) + projDate; writer = LuceneOperations.GetIndexWriter(path); _writers[projDate] = writer; } return(writer); } }
private void BuildFromWeiboWebPages() { var indexWriter = LuceneOperations.GetIndexWriter(WeiboConfigure.OutputPath); //int totalWeiboCount = 0; //int totalFileCount = 0; foreach (var filename in Directory.EnumerateFiles(WeiboConfigure.InputPath, "*.txt", SearchOption.AllDirectories)) { if (Path.GetFileName(filename).StartsWith("_")) { continue; } var parser = new WeiboParser(filename); foreach (var weibo in parser.GetContainedWeibo()) { Document doc = new Document(); doc.Add(new Field(WeiboLuceneFields.UserNickName, weibo.UserNickName, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.UserID, weibo.UserID, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.NewsArticleDescription, weibo.Content, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.DiscoveryStringTime, weibo.Time, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.Source, weibo.Source, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.UpCount, weibo.UpCount.ToString(), Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.ForwardCount, weibo.ForwardCount.ToString(), Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.CollectCount, weibo.CollectCount.ToString(), Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(WeiboLuceneFields.ReplyCount, weibo.ReplyCount.ToString(), Field.Store.YES, Field.Index.ANALYZED)); indexWriter.AddDocument(doc); } //Console.WriteLine(filename); //var cnt = parser.GetContainedWeibo().Count; //Console.WriteLine(cnt); //totalWeiboCount += cnt; //totalFileCount++; } //Console.WriteLine("Total count:" + totalWeiboCount); //Console.WriteLine("Total file count: " + totalFileCount); indexWriter.Optimize(); indexWriter.Close(); }
/// <summary>
/// Output the representative tweet text of each tweet cluster
/// Requires running selectRepresentative() first
/// Output: clusterRepOriginalText.txt
/// </summary>
/// <param name="fileName">Lucene index folder path of tweets</param>
public static void ouputRepresentativeOriginalText(string fileName)
{
    var indexReader = LuceneOperations.GetIndexReader(fileName);
    StreamReader sr = new StreamReader("clusterRepIDoc.txt", Encoding.Default);
    FileStream fs = new FileStream("clusterRepOriginalText.txt", FileMode.Create);
    StreamWriter sw = new StreamWriter(fs, Encoding.Default);
    string line;
    while ((line = sr.ReadLine()) != null)
    {
        Document inDoc = indexReader.Document(int.Parse(line));
        string text = inDoc.Get("Text");
        text = Regex.Replace(text, @"#N#", "");
        text = Regex.Replace(text, @"#n#", "");
        text = Regex.Replace(text, @"\s+", " ");
        sw.WriteLine(text);
    }
    sw.Close();
    fs.Close();
}
//public LuceneIndexTransform() //{ //} public void Transform(string inputPath, string outputPath, Dictionary <string, string> fieldNameTransformDictionary, Dictionary <string, Func <string, string> > fieldValueTransformDictionary = null, Func <Document, bool> documentPredicate = null) { if (fieldValueTransformDictionary == null) { fieldValueTransformDictionary = new Dictionary <string, Func <string, string> >(); } if (documentPredicate == null) { documentPredicate = document => true; } Func <string, string> defaultValueTransformFunc = str => str; LuceneOperations.EnumerateIndexReaderWriter(inputPath, outputPath, (inDoc, indexWriter) => { if (documentPredicate(inDoc)) { var outDoc = new Document(); foreach (var kvp in fieldNameTransformDictionary) { var inFieldName = kvp.Key; var inValue = inDoc.Get(inFieldName); if (inValue != null) { var outFieldName = kvp.Value; Func <string, string> valueTransformFunc; if (!fieldValueTransformDictionary.TryGetValue(inFieldName, out valueTransformFunc)) { valueTransformFunc = defaultValueTransformFunc; } LuceneOperations.AddField(outDoc, outFieldName, valueTransformFunc(inValue)); } } indexWriter.AddDocument(outDoc); } }); }
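// A hypothetical call to Transform above; the paths and field names are placeholders. It copies
// "Text" into NewsArticleDescription unchanged, reformats "CreatedAt" into the Bing news time
// format, and drops documents whose text is empty.
public void TransformTweetIndexExample()
{
    var fieldNameMap = new Dictionary<string, string>
    {
        { "Text", BingNewsFields.NewsArticleDescription },
        { "CreatedAt", BingNewsFields.DiscoveryStringTime },
    };
    var fieldValueMap = new Dictionary<string, Func<string, string>>
    {
        { "CreatedAt", str => DateTime.Parse(str).ToString(BingNewsFields.TimeFormat) },
    };
    Transform(@"D:\DataProcess\TweetIndex\SomeInput\",    // placeholder input index
              @"D:\DataProcess\TweetIndex\SomeOutput\",   // placeholder output index
              fieldNameMap, fieldValueMap,
              doc => !string.IsNullOrEmpty(doc.Get("Text")));
}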
public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var docNum = reader.NumDocs();
    ProgramProgress progress = new ProgramProgress(docNum);
    XmlDoc[] xmlDocs = new XmlDoc[docNum];
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        var doc = reader.Document(iDoc);
        xmlDocs[iDoc] = new XmlDoc(doc);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();

    // Serialize the document collection as XML
    XmlSerializer serializer = new XmlSerializer(typeof(XmlDocCollection));
    // Write the serialized collection to the output file
    serializer.Serialize(new StreamWriter(Configure.OutputPath), new XmlDocCollection() { XmlDocs = xmlDocs });
}
/// <summary> /// Calculate mention similarity matrix of tweet clusters /// </summary> public static void mentionSimilarity(string fileName) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default); StreamReader sr1 = new StreamReader("generalCluster.txt", Encoding.Default); FileStream fs = new FileStream("clusterMentionSimilarity.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); var mentionList = new List <HashSet <string> >(); string line; while ((line = sr.ReadLine()) != null) { line = sr.ReadLine(); sr.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } sr1.ReadLine(); line = sr1.ReadLine(); sr1.ReadLine(); iDocStrArray = Regex.Split(line, " "); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } var mention = new HashSet <string>(); for (int i = 0; i < iDocList.Count; i++) { Document inDoc = indexReader.Document(iDocList[i]); string userSrnName = inDoc.Get("UserScreenName"); mention.Add(userSrnName); string text = inDoc.Get("Text"); MatchCollection mc; mc = Regex.Matches(text, @"@[A-Za-z0-9_]+"); var it = mc.GetEnumerator(); for (int j = 0; j < mc.Count; j++) { it.MoveNext(); string str = it.Current.ToString(); mention.Add(str.Substring(1)); } } mentionList.Add(mention); } for (int i = 0; i < mentionList.Count; i++) { var mention1 = mentionList[i]; for (int j = 0; j < mentionList.Count; j++) { var mention2 = mentionList[j]; int sim = 0; foreach (var name in mention1) { if (mention2.Contains(name)) { sim = 1; break; } } sw.Write(sim + " "); } sw.WriteLine(); } sw.Close(); fs.Close(); sr1.Close(); sr.Close(); }
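// A minimal sketch of the @mention extraction used above: the author's screen name plus every
// "@name" token in the text goes into one set per cluster (standard Regex only).
static HashSet<string> ExtractMentions(string userScreenName, string text)
{
    var mentions = new HashSet<string> { userScreenName };
    foreach (Match match in Regex.Matches(text, @"@[A-Za-z0-9_]+"))
    {
        mentions.Add(match.Value.Substring(1));   // drop the leading '@'
    }
    return mentions;
}
// ExtractMentions("cdc", "RT @WHO: updates from @CDCgov") -> { "cdc", "WHO", "CDCgov" }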
public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig) { var indexReader = LuceneOperations.GetIndexReader(inputPath); var docNum = indexReader.NumDocs(); int[] docWordCnt = new int[docNum]; int[] docUniqWordCnt = new int[docNum]; Dictionary <string, int> wordDocCntDict = new Dictionary <string, int>(); Dictionary <string, int> wordOccCntDict = new Dictionary <string, int>(); var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector ? BingNewsFields.FeatureVectorFieldWeights : BingNewsFields.NewsFieldWeights; ProgramProgress progress = new ProgramProgress(docNum); for (int iDoc = 0; iDoc < docNum; iDoc++) { var document = indexReader.Document(iDoc); var content = LuceneOperations.GetContent(document, fieldWeights); var words = NLPOperations.Tokenize(content, tokenConfig); var uniqueWords = new HashSet <string>(words); docWordCnt[iDoc] = words.Count; docUniqWordCnt[iDoc] = uniqueWords.Count; foreach (var word in uniqueWords) { if (!wordDocCntDict.ContainsKey(word)) { wordDocCntDict.Add(word, 0); } wordDocCntDict[word]++; } foreach (var word in words) { if (!wordOccCntDict.ContainsKey(word)) { wordOccCntDict.Add(word, 0); } wordOccCntDict[word]++; } progress.PrintIncrementExperiment(); } progress.PrintTotalTime(); indexReader.Close(); //Statistics DoubleStatistics statDocWordCnt = new DoubleStatistics(); DoubleStatistics statDocUniqWordCnt = new DoubleStatistics(); DoubleStatistics statWordDocCnt = new DoubleStatistics(); DoubleStatistics statWordOccCnt = new DoubleStatistics(); for (int iDoc = 0; iDoc < docNum; iDoc++) { statDocWordCnt.AddNumber(docWordCnt[iDoc]); statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]); } foreach (var kvp in wordDocCntDict) { statWordDocCnt.AddNumber(kvp.Value); } foreach (var kvp in wordOccCntDict) { statWordOccCnt.AddNumber(kvp.Value); } Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt")); Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt")); Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt")); Console.WriteLine(statWordOccCnt.ToString("wordOccCnt")); //Hist var docWordCntHist = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1); var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1); var wordDocCntHist = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000); var wordDocCntHist2 = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1); docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv"); docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv"); wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv"); wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv"); Console.Read(); }
public static void AnalyzeSearchWordSentiment(string indexPath, string field, string[] keywords, int printDocumentCnt = 10, string histogramField = null)
{
    var searcher = LuceneOperations.GetIndexSearcher(indexPath);
    var reader = searcher.GetIndexReader();
    var docIDs = LuceneOperations.Search(searcher, StringOperations.GetMergedString(keywords, " "), field);
    Console.WriteLine("Find {0}% ({1}/{2}) documents containing: {3}", (100.0 * docIDs.Count / reader.NumDocs()), docIDs.Count, reader.NumDocs(), StringOperations.GetMergedString(keywords, " "));

    var progress = new ProgramProgress(docIDs.Count);
    var sentiAnalyzer = new SentimentAnalyzer();
    SentimentType sentimentType;
    double sentimentScore;
    HeapSortDouble hsdPos = new HeapSortDouble(printDocumentCnt);
    HeapSortDouble hsdNeg = new HeapSortDouble(printDocumentCnt);
    Counter<string> counterPos = null;
    Counter<string> counterNeg = null;
    Counter<string> counterNeu = null;
    if (histogramField != null)
    {
        counterPos = new Counter<string>();
        counterNeg = new Counter<string>();
        counterNeu = new Counter<string>();
    }

    int posCnt = 0;
    int negCnt = 0;
    int neuCnt = 0;
    foreach (var docID in docIDs)
    {
        var document = reader.Document(docID);
        var content = document.Get(field);
        sentiAnalyzer.GetSentiment(content, out sentimentType, out sentimentScore);
        switch (sentimentType)
        {
            case SentimentType.Positive:
                posCnt++;
                hsdPos.Insert(docID, Math.Abs(sentimentScore));
                if (histogramField != null)
                {
                    counterPos.Add(document.Get(histogramField));
                }
                break;
            case SentimentType.Negative:
                negCnt++;
                hsdNeg.Insert(docID, Math.Abs(sentimentScore));
                if (histogramField != null)
                {
                    counterNeg.Add(document.Get(histogramField));
                }
                break;
            case SentimentType.Neutral:
                neuCnt++;
                if (histogramField != null)
                {
                    counterNeu.Add(document.Get(histogramField));
                }
                break;
            default:
                throw new NotImplementedException();
        }
        progress.PrintIncrementExperiment();
    }

    Console.WriteLine("Positive document ratio {0}% ({1}/{2})", Math.Round(100.0 * posCnt / docIDs.Count), posCnt, docIDs.Count);
    Console.WriteLine("Negative document ratio {0}% ({1}/{2})", Math.Round(100.0 * negCnt / docIDs.Count), negCnt, docIDs.Count);
    Console.WriteLine("Neutral document ratio {0}% ({1}/{2})", Math.Round(100.0 * neuCnt / docIDs.Count), neuCnt, docIDs.Count);
    Console.WriteLine(StringOperations.WrapWithDash("Positive documents"));
    foreach (var kvp in hsdPos.GetSortedDictionary())
    {
        Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
    }
    Console.WriteLine(StringOperations.WrapWithDash("Negative documents"));
    foreach (var kvp in hsdNeg.GetSortedDictionary())
    {
        Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
    }
    progress.PrintTotalTime();

    if (histogramField != null)
    {
        string[] featureStrings = new[] { "Pos", "Neg", "Neu" };
        Counter<string>[] counters = new[] { counterPos, counterNeg, counterNeu };
        for (int i = 0; i < featureStrings.Length; i++)
        {
            Console.WriteLine(StringOperations.WrapWithDash(histogramField + " " + featureStrings[i]));
            int index = 0;
            foreach (var kvp in counters[i].GetCountDictionary().OrderByDescending(kvp => kvp.Value))
            {
                Console.WriteLine(kvp.Key + "\t" + kvp.Value);
                if (++index >= 100)
                {
                    break;
                }
            }
        }
    }
    Console.ReadKey();
}
public void Transform(string inputFolder, string indexPath, HashSet<string> keywords)
{
    Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
    Console.WriteLine("InputFolder: " + inputFolder + "\n");
    string notParseSpecString = "Temp-DoNotParse";
    inputFolder = StringOperations.EnsureFolderEnd(inputFolder);
    string[] schema = new[] { "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount", "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount" };
    var schemeDict = Util.GetInvertedDictionary(schema);
    var textFieldIndex = schemeDict["Text"];
    var createdTimeFieldIndex = schemeDict["CreatedAt"];
    var userIdFieldIndex = schemeDict["UserId"];
    //string outputPath = inputFolder + notParseSpecString + "\\";
    //if (Directory.Exists(outputPath))
    //{
    //    Directory.Delete(outputPath, true);
    //}
    //Directory.CreateDirectory(outputPath);
    //var indexPath = outputPath + "Index\\";
    if (Directory.Exists(indexPath))
    {
        Directory.Delete(indexPath, true);
    }
    var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

    //Preprocess: estimate the tweet count so parsing progress can be reported
    Console.WriteLine("Start preprocessing...");
    ProgramProgress progress = new ProgramProgress(files.Length);
    int estiDocCnt = 0;
    foreach (var file in files)
    {
        estiDocCnt += FileOperations.GetLineCount(file);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

    //Parse
    Console.WriteLine("Start parsing...");
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
    progress = new ProgramProgress(estiDocCnt);
    var sep = new char[] { '\t' };
    int uniqDocFoundCnt = 0;
    int docFoundCnt = 0;
    int docCnt = 0;
    ThreeLayerHashSet<string, long, string> hash3Layer = new ThreeLayerHashSet<string, long, string>();
    int notUsedDocCnt = 0;
    foreach (var file in files)
    {
        if (file.Contains(notParseSpecString))
        {
            continue;
        }
        if (file.EndsWith(".txt"))
        {
            var sr = new StreamReader(file);
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                var tokens = line.Split(sep, StringSplitOptions.None);
                if (tokens.Length != schema.Length)
                {
                    notUsedDocCnt++;
                    continue; //throw new ArgumentException();
                }
                var words = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                bool isContainSearch = false;
                foreach (var word in words)
                {
                    if (keywords.Contains(word))
                    {
                        isContainSearch = true;
                        break;
                    }
                }
                if (isContainSearch)
                {
                    string createdAt = tokens[createdTimeFieldIndex];
                    long userId = long.Parse(tokens[userIdFieldIndex]);
                    string text = tokens[textFieldIndex];
                    // De-duplicate on (createdAt, userId, text) before indexing
                    if (!hash3Layer.Contains(createdAt, userId, text))
                    {
                        var document = new Document();
                        for (int i = 0; i < schema.Length; i++)
                        {
                            document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                        }
                        indexWriter.AddDocument(document);
                        hash3Layer.Add(createdAt, userId, text);
                        uniqDocFoundCnt++;
                    }
                    docFoundCnt++;
                }
                docCnt++;
                progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%", uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
            }
            sr.Close();
        }
    }
    progress.PrintTotalTime();
    Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%", uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
    Console.WriteLine("Not used doc count: " + notUsedDocCnt);
    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();
    Console.WriteLine("Finish");
    Console.ReadKey();
}
public void Start() { if (!outputpath.EndsWith("\\")) { outputpath += "\\"; } var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr); var searcher = LuceneOperations.GetIndexSearcher(inputpath); var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs()); var scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num); int iter = 0; bool bContinue = threshold == 0 ? false : true; while (bContinue && iter < 5) { iter++; Console.WriteLine("iteration------------------" + iter); List <string> keywordsNew; #region Calculate Keywords var counter = new Counter <string>(); foreach (var scoredDoc in scoredDocs) { var doc = searcher.Doc(scoredDoc.doc); var content = doc.Get(searchfield); foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig)) { counter.Add(word); } } keywordsNew = counter.GetMostFreqObjs(keywordNum); #endregion var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num); #region Test whether exit int repeatNum = 0; var docIDs = new HashSet <int>(); foreach (var scoredDoc in scoredDocs) { docIDs.Add(scoredDoc.doc); } foreach (var scoredDocNew in scoredDocsNew) { if (docIDs.Contains(scoredDocNew.doc)) { repeatNum++; } } bContinue = (double)repeatNum / scoredDocs.Length < threshold; #endregion Console.WriteLine(repeatNum + " " + scoredDocsNew.Length); keywords = keywordsNew; scoredDocs = scoredDocsNew; Console.WriteLine(StringOperations.GetMergedString(keywords)); } max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs()); scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num); var writer = LuceneOperations.GetIndexWriter(outputpath); foreach (var scoredDoc in scoredDocs) { Document doc = searcher.Doc(scoredDoc.doc); writer.AddDocument(doc); } writer.Optimize(); writer.Close(); if (isPrintRemovedDocuments) { var sw = new StreamWriter(outputpath + "removeDocuments.txt"); var selectedDocIDs = new HashSet <int>(); foreach (var scoredDoc in scoredDocs) { selectedDocIDs.Add(scoredDoc.doc); } var reader = searcher.GetIndexReader(); for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++) { if (!selectedDocIDs.Contains(iDoc)) { sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc))); } } reader.Close(); sw.Flush(); sw.Close(); } searcher.Close(); Console.WriteLine("Done"); Console.ReadKey(); }
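// A small sketch of the stopping test in the keyword-expansion loop above: the fraction of
// previously retrieved documents that are retrieved again with the new keywords; iteration
// continues while this overlap stays below the threshold.
static double GetOverlapRatio(IEnumerable<int> oldDocIds, IEnumerable<int> newDocIds)
{
    var oldSet = new HashSet<int>(oldDocIds);
    if (oldSet.Count == 0)
    {
        return 1.0;   // nothing to re-find; treat as fully converged
    }
    int repeatNum = 0;
    foreach (var id in newDocIds)
    {
        if (oldSet.Contains(id))
        {
            repeatNum++;
        }
    }
    return (double)repeatNum / oldSet.Count;
}
// Example: old = {1,2,3,4}, new = {2,3,4,5} -> overlap 0.75, so a threshold of 0.8 would trigger another iteration.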
public void Start() { Initialize(); var reader = LuceneOperations.GetIndexReader(Configure.InputPath); InitializeWriters(); var docNum = reader.NumDocs(); var progress = new ProgramProgress(docNum); for (int iDoc = 0; iDoc < docNum; iDoc++) { var doc = reader.Document(iDoc); bool isSkip = false; //random sample if (!isSkip && Configure.IsSampling) { if (Random.NextDouble() > Configure.SampleRatio) { isSkip = true; } } //filter by time if (!isSkip && Configure.IsSelectByTime) { var dateTime = StringOperations.ParseDateTimeString( doc.Get(Configure.TimeField), Configure.ParseTimeFormat); if (dateTime.Subtract(StartDateTime).Ticks < 0 || dateTime.Subtract(EndDateTime).Ticks > 0) { isSkip = true; } } //filter by exact match if (!isSkip && Configure.IsSelectByExactMatch) { foreach (var kvp in Configure.FieldMatchDict) { if (doc.Get(kvp.Key) != kvp.Value) { isSkip = true; break; } } } if (!isSkip) { GetWriter(doc).AddDocument(doc); } progress.PrintIncrementExperiment(); } CloseWriters(); reader.Close(); }
/// <summary> /// Select a representative tweet for each tweet cluster /// Output: clusterRepIDoc.txt, clusterRepText.txt, clusterRepWords.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> /// <param name="gramsList">List of 3-grams sets of signal tweets in each signal tweet cluster</param> /// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list #</param> public static void selectRepresentative(string fileName, List <List <HashSet <string> > > gramsList, Dictionary <int, int> iDoc2rec) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default); FileStream fs = new FileStream("clusterRepIDoc.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); FileStream fs1 = new FileStream("clusterRepText.txt", FileMode.Create); StreamWriter sw1 = new StreamWriter(fs1, Encoding.Default); FileStream fs2 = new FileStream("clusterRepWords.txt", FileMode.Create); StreamWriter sw2 = new StreamWriter(fs2, Encoding.Default); string line; while ((line = sr.ReadLine()) != null) { line = sr.ReadLine(); sr.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } double[] simArr = new double[iDocList.Count]; for (int i = 0; i < iDocList.Count; i++) { simArr[i] = 0.0; } for (int i = 0; i < iDocList.Count; i++) { int rec1 = iDoc2rec[iDocList[i]]; for (int j = i + 1; j < iDocList.Count; j++) { int rec2 = iDoc2rec[iDocList[j]]; double sim = ClusterGeneral.jaccard(gramsList[rec1], gramsList[rec2]); simArr[i] += sim; simArr[j] += sim; } } if (iDocList.Count > 1) { for (int i = 0; i < iDocList.Count; i++) { simArr[i] /= (iDocList.Count - 1); } } double maxSim = -1.0; int maxSimIndex = -1; for (int i = 0; i < iDocList.Count; i++) { if (simArr[i] > maxSim) { maxSim = simArr[i]; maxSimIndex = i; } } int iDoc = iDocList[maxSimIndex]; Document inDoc = indexReader.Document(iDoc); string text = inDoc.Get("Text").ToLower(); text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"#n#", ""); string words = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", ""); sw.WriteLine(iDoc); sw1.WriteLine(text); sw2.WriteLine(words); } sw2.Close(); fs2.Close(); sw1.Close(); fs1.Close(); sw.Close(); fs.Close(); sr.Close(); }
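// ClusterGeneral.jaccard is not shown in this listing; the following is an assumed reference
// implementation of the similarity it presumably computes between two tweets' n-gram sets:
// the Jaccard coefficient averaged over the unigram, bigram and trigram levels.
static double Jaccard(List<HashSet<string>> gramsA, List<HashSet<string>> gramsB)
{
    double sum = 0.0;
    int levels = Math.Min(gramsA.Count, gramsB.Count);
    for (int i = 0; i < levels; i++)
    {
        var intersection = new HashSet<string>(gramsA[i]);
        intersection.IntersectWith(gramsB[i]);
        var union = new HashSet<string>(gramsA[i]);
        union.UnionWith(gramsB[i]);
        sum += union.Count == 0 ? 0.0 : (double)intersection.Count / union.Count;
    }
    return levels == 0 ? 0.0 : sum / levels;
}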
public void Start() { string inputPath = @"D:\DataProcess\TweetIndex\tweets-Ebola-20150101-20150228_dedup\"; string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter2\"; var indexReader = LuceneOperations.GetIndexReader(inputPath); var indexWriter = LuceneOperations.GetIndexWriter(outputPath); char[] seperator = new char[] { ' ' }; string[] aidFields = new string[] { "User_FollowersCount", "User_Name", "User_ScreenName", "Retweet", "Mention" }; ProgramProgress progress = new ProgramProgress(indexReader.NumDocs()); //for (int iDoc = 0; iDoc < 1000; iDoc++) for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++) { Document inDoc = indexReader.Document(iDoc); Document outDoc = new Document(); string inTime = inDoc.Get("CreateAt"); DateTime dateTime = DateTime.Parse(inTime); outDoc.Add(new Field(BingNewsFields.DiscoveryStringTime, dateTime.ToString(BingNewsFields.TimeFormat), Field.Store.YES, Field.Index.ANALYZED)); string hashtag = inDoc.Get("Hashtag"); string word = inDoc.Get("Word"); if (hashtag == null) { hashtag = ""; } var hashtagTokens = hashtag.Split(seperator, StringSplitOptions.RemoveEmptyEntries); var wordTokens = word.Split(seperator, StringSplitOptions.RemoveEmptyEntries); string title = hashtagTokens.Length > 0 ? hashtagTokens[0] : wordTokens.Length > 0 ? wordTokens[0] : ""; outDoc.Add(new Field(BingNewsFields.NewsArticleHeadline, title, Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.NewsArticleDescription, inDoc.Get("Text"), Field.Store.YES, Field.Index.ANALYZED)); string featureVector = ""; Counter <string> counter = new Counter <string>(); foreach (var tag in hashtagTokens) { counter.Add(tag); counter.Add(tag); } foreach (var w in wordTokens) { counter.Add(w); } foreach (var kvp in counter.GetSortedCountDictioanry()) { featureVector += string.Format("{0}({1})\\n", kvp.Key, kvp.Value); } outDoc.Add(new Field(BingNewsFields.FeatureVector, featureVector, Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.DocId, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.DocumentURL, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED)); foreach (var aidField in aidFields) { var value = inDoc.Get(aidField); outDoc.Add(new Field(aidField, value == null ? "" : value, Field.Store.YES, Field.Index.ANALYZED)); } indexWriter.AddDocument(outDoc); progress.PrintIncrementExperiment(); } indexWriter.Optimize(); indexWriter.Close(); indexReader.Close(); }
private void BuildFromBingNewsXMLs() { string[] selectedFields = new string[] { "DocumentURL", "DocumentUrl", "Country", "NewsArticleCategoryData", "NewsArticleHeadline", "NewsArticleDescription", "DiscoveryStringTime", "PublishedDateTime", "DownloadStringTime", "PublishedDateTime", "NewsSource" }; //NewsArticleBodyNEMap, RealTimeType List <string> bingnewspaths = BingNewsConfigure.BingNewsPaths; int iProcessor = BingNewsConfigure.iProcessor; int processorNum = BingNewsConfigure.ProcessorNum; string startdate = BingNewsConfigure.StartDate; string enddate = BingNewsConfigure.EndDate; List <string[]> keywordLists = BingNewsConfigure.KeywordLists; List <string> indexpaths = BingNewsConfigure.IndexPaths; List <string> languages = BingNewsConfigure.Languages; int maxThreadNum = BingNewsConfigure.MaxThreadNum; //LoadExtractBingNewsDataConfig_KeyWordList(out bingnewspaths, // out iProcessor, out processorNum, out startdate, out enddate, // out keywordLists, out languages, out indexpaths); List <string> outputdirs = new List <string>(); List <string> infofilenames = new List <string>(); int ikeyword2 = 0; foreach (string indexpath in indexpaths) { string outputdir = indexpath + "BingNews_" + keywordLists[ikeyword2][0] + "_" + iProcessor + "_" + processorNum; if (!Directory.Exists(outputdir)) { Directory.CreateDirectory(outputdir); } infofilenames.Add(indexpath + "BingNews_" + keywordLists[ikeyword2][0] + "_" + iProcessor + "_" + processorNum + ".dat"); outputdirs.Add(outputdir); ikeyword2++; } List <IndexWriter> indexwriters = new List <IndexWriter>(); List <StreamWriter> infofiles = new List <StreamWriter>(); for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++) { IndexWriter indexwriter = LuceneOperations.GetIndexWriter(outputdirs[ikeyword2]); StreamWriter infofile = new StreamWriter(infofilenames[ikeyword2]); indexwriters.Add(indexwriter); infofiles.Add(infofile); } List <string> allfilenames = new List <string>(); foreach (var bingnewpath in bingnewspaths) { allfilenames.AddRange(Directory.GetFiles(bingnewpath, "*.*", System.IO.SearchOption.AllDirectories)); } allfilenames = FilterDates(allfilenames, startdate, enddate).ToList(); List <string> filenames = new List <string>(); for (int i = iProcessor; i < allfilenames.Count; i += processorNum) { filenames.Add(allfilenames[i]); } Stopwatch stopwatch = new Stopwatch(); stopwatch.Start(); ProgramProgress progress = new ProgramProgress(filenames.Count); //ProgramProgress progress = new ProgramProgress(filenames.Count / processorNum); int[] newsfoundcnts = new int[keywordLists.Count]; DateTime time_begin_1 = DateTime.Now; //for (int ifilename = iProcessor; ifilename < filenames.Count; ifilename += processorNum) if (maxThreadNum == 1) { foreach (var filename in filenames) { BuildLuceneFromFile(filename, keywordLists, indexwriters, languages, selectedFields, newsfoundcnts, infofiles, progress); } } else { ParallelOptions options = new ParallelOptions(); options.MaxDegreeOfParallelism = maxThreadNum; object obj = new Object(); Parallel.ForEach(filenames, options, filename => BuildLuceneFromFile(filename, keywordLists, indexwriters, languages, selectedFields, newsfoundcnts, infofiles, progress)); } for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++) { infofiles[ikeyword2].WriteLine("Extract xml time\t" + stopwatch.Elapsed); } Console.WriteLine("Start writing to lucene index..."); Stopwatch stopwatch2 = new Stopwatch(); stopwatch2.Start(); for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++) { 
indexwriters[ikeyword2].Optimize(); indexwriters[ikeyword2].Close(); } for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++) { infofiles[ikeyword2].WriteLine("Write to lucene index time\t" + stopwatch2.Elapsed); infofiles[ikeyword2].WriteLine("Total time\t" + stopwatch.Elapsed); infofiles[ikeyword2].Flush(); infofiles[ikeyword2].Close(); } }
public void TransformWithFileNames(string[] files, string indexPath, HashSet <string> searchHashSet, SearchSpinn3rType searchType) { double tweetCnt = 0; TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter); var indexWriter = LuceneOperations.GetIndexWriter(indexPath); var progress = new ProgramProgress(files.Length); int docFoundCount = 0; int totalDocCount = 0; foreach (var file in files) { FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) => { tweetCnt += data.count; //Console.WriteLine(data.count); //Console.WriteLine(data.items[0].main); foreach (var tweet in data.items) { if (tweet.lang != "en") { continue; } bool isContainSearch = false; switch (searchType) { case SearchSpinn3rType.Main: var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig); foreach (var word in words) { if (searchHashSet.Contains(word)) { isContainSearch = true; break; } } break; case SearchSpinn3rType.User: isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower()); break; default: throw new ArgumentException(); } if (isContainSearch) { var document = new Document(); document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED)); indexWriter.AddDocument(document); docFoundCount++; } totalDocCount++; } }); progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount)); } progress.PrintTotalTime(); Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount); Console.WriteLine("Start writing index..."); indexWriter.Commit(); indexWriter.Close(); Util.ProgramFinishHalt(); }