public void Start() { var writer = LuceneOperations.GetIndexWriter(OutputPath); var totalDocCnt = 0; foreach (var inputPath in InputPaths) { var reader = LuceneOperations.GetIndexReader(inputPath); totalDocCnt += reader.NumDocs(); reader.Close(); } var progress = new ProgramProgress(totalDocCnt); foreach (var inputPath in InputPaths) { var reader = LuceneOperations.GetIndexReader(inputPath); for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++) { writer.AddDocument(reader.Document(iDoc)); progress.PrintIncrementExperiment(); } reader.Close(); } writer.Optimize(); writer.Close(); }
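// Hedged alternative sketch for the merge loop above, assuming the Lucene.NET version in use
// exposes IndexWriter.AddIndexes(params IndexReader[]); if it does not, the per-document copy
// above is the safe path. Method name and shape are illustrative only (not in the original code).
public void MergeIndexesWithAddIndexes(string[] inputPaths, string outputPath)
{
    var writer = LuceneOperations.GetIndexWriter(outputPath);
    var readers = inputPaths.Select(LuceneOperations.GetIndexReader).ToArray();
    writer.AddIndexes(readers);   // lets Lucene copy whole segments instead of re-adding documents one by one
    foreach (var reader in readers)
    {
        reader.Close();
    }
    writer.Optimize();
    writer.Close();
}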
public void StartTransformTweetIndexForStreamingRoseRiver()
{
    string inputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01\";
    string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01_MOD\";
    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

    string docIDField = BingNewsFields.DocId;
    string urlField = BingNewsFields.DocumentURL;

    ProgramProgress progress = new ProgramProgress(indexReader.NumDocs());
    for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
    {
        // Rewrite DocId and DocumentURL so every document carries a unique, predictable value.
        Document outDoc = indexReader.Document(iDoc);
        outDoc.RemoveField(docIDField);
        outDoc.Add(new Field(docIDField, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));
        outDoc.RemoveField(urlField);
        outDoc.Add(new Field(urlField, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

        indexWriter.AddDocument(outDoc);
        progress.PrintIncrementExperiment();
    }
    indexWriter.Optimize();
    indexWriter.Close();
    indexReader.Close();
}
public static void AnalyzeLanguageDistribution(string inputPath)
{
    Counter<string> counter = new Counter<string>();
    var filenames = Directory.GetFiles(inputPath, "*.*", System.IO.SearchOption.AllDirectories);
    ProgramProgress progress = new ProgramProgress(filenames.Length, PrintType.Console);
    foreach (var filename in filenames)
    {
        List<XmlDocument> xmldocs = new List<XmlDocument>();
        if (filename.EndsWith(".zip"))
        {
            var zipfile = new ZipFile(filename);
            foreach (ZipEntry entry in zipfile.Entries)
            {
                // Use a fresh stream per entry: reusing a disposed stream would fail on the second entry.
                using (MemoryStream ms = new MemoryStream())
                {
                    entry.Extract(ms);
                    ms.Position = 0;
                    XmlDocument xmldoc = new XmlDocument();
                    xmldoc.Load(ms);
                    xmldocs.Add(xmldoc);
                }
            }
        }
        else
        {
            try
            {
                XmlDocument xmldoc = new XmlDocument();
                xmldoc.Load(filename);
                xmldocs.Add(xmldoc);
            }
            catch
            {
                var xmldoclist = DataProcess.DataTransform.BuildLuceneIndex.GetXMLDocList(filename);
                xmldocs.AddRange(xmldoclist);
            }
        }

        foreach (XmlDocument xmldoc in xmldocs)
        {
            XmlNodeList list = xmldoc.GetElementsByTagName("NewsArticleDescription");
            foreach (XmlNode bodynemapnode in list)
            {
                XmlNode newsnode = bodynemapnode.ParentNode;
                XmlNode languagenode = newsnode.SelectSingleNode("Language");
                counter.Add(languagenode.InnerText);
            }
        }
        progress.PrintIncrementExperiment();
    }

    foreach (var kvp in counter.GetCountDictionary())
    {
        Console.WriteLine(kvp.Key + "\t" + kvp.Value);
    }
}
/// <summary>
/// Twitter data from Cosmos: each line represents a tweet.
/// Fields are separated by '\t'; the schema gives the name of each field.
/// </summary>
private void BuildFromTwitterTxt()
{
    string inputpath = TwitterConfigure.InputPath;
    string outputpath = TwitterConfigure.OutputPath;
    var schema = TwitterConfigure.TwitterSchema;
    string bodyField = TwitterConfigure.TwitterBodyField;
    var indexwriter = LuceneOperations.GetIndexWriter(outputpath);

    // First pass: count lines so progress can be reported against a known total.
    StreamReader sr = new StreamReader(inputpath);
    string line;
    int lineCnt = 0;
    while ((line = sr.ReadLine()) != null)
    {
        lineCnt++;
    }
    sr.Close();

    // Second pass: build the index.
    sr = new StreamReader(inputpath);
    var separator = new char[] { '\t' };
    int lineIndex = 0;
    var progress = new ProgramProgress(lineCnt);
    while ((line = sr.ReadLine()) != null)
    {
        var tokens = line.Split(separator);
        if (tokens.Length != schema.Length)
        {
            throw new Exception("Line does not match the schema");
        }
        var document = new Document();
        for (int i = 0; i < tokens.Length; i++)
        {
            if (schema[i] == bodyField)
            {
                tokens[i] = RemoveContentNoise.RemoveTweetIndexNoise(tokens[i]);
            }
            document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
        }
        indexwriter.AddDocument(document);
        lineIndex++;
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();

    sr.Close();
    indexwriter.Optimize();
    indexwriter.Close();
}
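// Sketch of a single-expression line count for sizing ProgramProgress, as an alternative to the
// two-pass StreamReader loop above. Assumes .NET 4.0+, where File.ReadLines streams lines lazily;
// the helper name is illustrative only (not in the original code).
private static int CountLines(string path)
{
    return File.ReadLines(path).Count();   // streams the file once without loading it into memory
}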
public void TransformWithFileNameContentSearch(string[] files, string indexPath, string searchStr, string progressEndStr = null) { double tweetCnt = 0; var indexWriter = LuceneOperations.GetIndexWriter(indexPath); searchStr = searchStr.ToLower(); var progress = new ProgramProgress(files.Length); int docFoundCount = 0; int totalDocCount = 0; foreach (var file in files) { FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) => { tweetCnt += data.count; //Console.WriteLine(data.count); //Console.WriteLine(data.items[0].main); foreach (var tweet in data.items) { if (tweet.lang != "en") { continue; } if (tweet.main.ToLower().Contains(searchStr)) { var document = new Document(); document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED)); indexWriter.AddDocument(document); docFoundCount++; } totalDocCount++; } }); progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%) -- {3}", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount, progressEndStr)); } progress.PrintTotalTime(); Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount); Console.WriteLine("Start writing index..."); indexWriter.Commit(); indexWriter.Close(); //Util.ProgramFinishHalt(); }
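// Small hypothetical helper for the percentage used in the progress and summary messages above
// (and in the similar Spinn3r transform later in this file): it guards against division by zero
// when no documents have been read yet. Not part of the original code; purely illustrative.
private static int Percentage(int part, int total)
{
    return total == 0 ? 0 : 100 * part / total;
}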
public void AnalyzeDocuments() { string fileName = @"D:\Project\TopicPanorama\data\TopicGraphs\NewCode-Ebola-Test2\Raw\news\result\lda.top.json"; string indexPath = @"D:\DataProcess\Index\Raw_EbolaEnBingNews_Ebola_0_1_RS_R-1"; int topDocCnt = 20; var indexReader = LuceneOperations.GetIndexReader(indexPath); //Read from json and sort SimpleJsonReader reader = new SimpleJsonReader(new StreamReader(File.Open(fileName, FileMode.Open))); HeapSortDouble[] hsd = null; int topicNumber = -1; ProgramProgress progress = new ProgramProgress(indexReader.NumDocs()); while (reader.IsReadable) { int docID = int.Parse(reader.ReadPropertyName()); double[] topicArray = reader.ReadDoubleArray(); if (topicNumber < 0) { topicNumber = topicArray.Length; hsd = new HeapSortDouble[topicNumber]; for (int i = 0; i < topicNumber; i++) { hsd[i] = new HeapSortDouble(topDocCnt); } } for (int i = 0; i < topicNumber; i++) { hsd[i].Insert(docID, topicArray[i]); } progress.PrintIncrementExperiment(); } progress.PrintTotalTime(); //Statistics Console.ReadLine(); }
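// Sketch of the "Statistics" step that AnalyzeDocuments leaves unfinished: print the top-weighted
// documents for each topic. Hypothetical helper; it assumes HeapSortDouble.GetSortedDictionary()
// (used elsewhere in this file) yields docID -> weight pairs, and that the index stores
// BingNewsFields.NewsArticleHeadline like the other Bing-news indexes handled in this file.
private static void PrintTopDocumentsPerTopic(HeapSortDouble[] hsd, IndexReader indexReader)
{
    for (int iTopic = 0; iTopic < hsd.Length; iTopic++)
    {
        Console.WriteLine(StringOperations.WrapWithDash("Topic " + iTopic));
        foreach (var kvp in hsd[iTopic].GetSortedDictionary())
        {
            var doc = indexReader.Document(kvp.Key);
            Console.WriteLine(kvp.Value + "\t" + doc.Get(BingNewsFields.NewsArticleHeadline));
        }
    }
}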
private List <int> RemoveSameURLDocument(IndexReader reader, List <int> orgDocIDs) { var newDocIDs = new List <int>(); var docNum = orgDocIDs.Count; HashSet <string> urlHash = new HashSet <string>(); Console.WriteLine("Total {0} docs", docNum); int removeDocNum = 0; string urlfield = Configure.URLField; var progress = new ProgramProgress(docNum); foreach (var iDoc in orgDocIDs) { var document = reader.Document(iDoc); string url = document.Get(urlfield); if (url != null) { url = url.ToLower(); if (!urlHash.Contains(url)) { newDocIDs.Add(iDoc); urlHash.Add(url); } else { removeDocNum++; } } progress.PrintIncrementExperiment(); } Console.WriteLine("Finished remove same URL. Removed {0} out of {1}", removeDocNum, docNum); return(newDocIDs); }
public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var docNum = reader.NumDocs();
    ProgramProgress progress = new ProgramProgress(docNum);
    XmlDoc[] xmlDocs = new XmlDoc[docNum];
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        var doc = reader.Document(iDoc);
        xmlDocs[iDoc] = new XmlDoc(doc);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();

    // Serialize the collection and write it to the configured output path.
    XmlSerializer serializer = new XmlSerializer(typeof(XmlDocCollection));
    serializer.Serialize(new StreamWriter(Configure.OutputPath), new XmlDocCollection() { XmlDocs = xmlDocs });
}
private void BuildLuceneFromFile(string filename, List<string[]> keywordLists, List<IndexWriter> indexwriters,
    List<string> languages, string[] selectedFields, int[] newsfoundcnts, List<StreamWriter> infofiles,
    ProgramProgress progress)
{
    int deltanewsfoundcnt = 0;
    List<XmlDocument> xmldocs = new List<XmlDocument>();
    if (filename.EndsWith(".zip"))
    {
        var zipfile = new ZipFile(filename);
        foreach (ZipEntry entry in zipfile.Entries)
        {
            // Use a fresh stream per entry: reusing a disposed stream would fail on the second entry.
            using (MemoryStream ms = new MemoryStream())
            {
                entry.Extract(ms);
                ms.Position = 0;
                XmlDocument xmldoc = new XmlDocument();
                xmldoc.Load(ms);
                xmldocs.Add(xmldoc);
            }
        }
    }
    else
    {
        try
        {
            XmlDocument xmldoc = new XmlDocument();
            xmldoc.Load(filename);
            xmldocs.Add(xmldoc);
        }
        catch
        {
            var xmldoclist = GetXMLDocList(filename);
            xmldocs.AddRange(xmldoclist);
        }
    }

    foreach (XmlDocument xmldoc in xmldocs)
    {
        XmlNodeList list = xmldoc.GetElementsByTagName("NewsArticleDescription");
        foreach (XmlNode bodynemapnode in list)
        {
            for (int ikeyword = 0; ikeyword < keywordLists.Count; ikeyword++)
            {
                var keywords = keywordLists[ikeyword];
                IndexWriter indexwriter = indexwriters[ikeyword];
                string str = bodynemapnode.InnerText;
                bool bStore = false;
                foreach (var keyword in keywords)
                {
                    if (str.Contains(keyword))
                    {
                        bStore = true;
                        break;
                    }
                }
                if (bStore)
                {
                    XmlNode newsnode = bodynemapnode.ParentNode;
                    XmlNode languagenode = newsnode.SelectSingleNode("Language");
                    // Keep only documents written in one of the selected languages
                    if (!languages.Contains(languagenode.InnerText))
                    {
                        continue;
                    }
                    // Extract all useful fields
                    string docid = newsnode.Attributes[0].Value;
                    Document document = new Document();
                    document.Add(new Field("DocId", docid, Field.Store.YES, Field.Index.ANALYZED));
                    foreach (string fieldname in selectedFields)
                    {
                        XmlNode node = newsnode.SelectSingleNode(fieldname);
                        if (node != null)
                        {
                            string luceneFieldName = fieldname;
                            if (luceneFieldName == "DocumentUrl")
                            {
                                luceneFieldName = "DocumentURL";
                            }
                            document.Add(new Field(luceneFieldName, node.InnerText, Field.Store.YES, Field.Index.ANALYZED));
                        }
                    }
                    indexwriter.AddDocument(document);
                    // newsfoundcnts is shared across worker threads, so keep the interlocked update.
                    Interlocked.Increment(ref newsfoundcnts[ikeyword]);
                    deltanewsfoundcnt++;
                }
            }
        }
    }

    for (int ikeyword = 0; ikeyword < keywordLists.Count; ikeyword++)
    {
        infofiles[ikeyword].WriteLine(filename + "\t\t" + deltanewsfoundcnt + "\t\t" + newsfoundcnts[ikeyword]);
        infofiles[ikeyword].Flush();
    }
    progress.PrintIncrementExperiment();
}
public void StartKDD() { // -- node counts -- string folder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\Data\KddInfovisGraphicsIndex_Lucene_a=0.003_sm=1\"; string exeFolder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\RoseRiver\bin\x64\Release\"; List <int> nodeCounts = new List <int>(); for (int i = 0; i < 11; i++) { var fileName = folder + i + ".gv"; var tree = BRTAnalysis.ReadTree(fileName); nodeCounts.Add(tree.BFS(tree.Root).Count()); } // -- experiment -- var copyFactors = new[] { 2, 1 }; var focusCounts = DataProcess.Utils.Util.GetIntArray(1, 5); var focusSampleCount = 5; var minMaxTreeCount = 6; var maxMaxTreeCount = 8; int index = 0; ProgramProgress progress = new ProgramProgress(copyFactors.Length * focusCounts.Length * focusSampleCount * (maxMaxTreeCount - minMaxTreeCount + 1)); var configure = new TopicStreamConfigure(); configure.DataType = "kdd"; foreach (var copyFactor in copyFactors) { configure.CopyFactor = copyFactor; foreach (var focusCount in focusCounts) { for (int iFocusSample = 0; iFocusSample < focusSampleCount; iFocusSample++) { configure.FocusCount = focusCount; configure.DefaultTreeCut = GetRandomManuallyTreeCut(focusCount, minMaxTreeCount, iFocusSample, nodeCounts, 1); configure.DefaultTreeCutRandomSeed = iFocusSample; for (int iMaxTreeCount = minMaxTreeCount; iMaxTreeCount <= maxMaxTreeCount; iMaxTreeCount++) { configure.TreeCount = iMaxTreeCount; configure.Index = index; configure.Write(); File.Copy(TopicStreamConfigure.ConfigureFileName, exeFolder + TopicStreamConfigure.ConfigureFileName, true); ProcessStartInfo startInfo = new ProcessStartInfo(); startInfo.CreateNoWindow = true; startInfo.UseShellExecute = false; startInfo.FileName = exeFolder + @"RoseRiver.exe"; startInfo.WindowStyle = ProcessWindowStyle.Hidden; using (Process exeProcess = Process.Start(startInfo)) { exeProcess.WaitForExit(); } progress.PrintIncrementExperiment("\n"); index++; } } } } progress.PrintTotalTime(); }
public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig) { var indexReader = LuceneOperations.GetIndexReader(inputPath); var docNum = indexReader.NumDocs(); int[] docWordCnt = new int[docNum]; int[] docUniqWordCnt = new int[docNum]; Dictionary <string, int> wordDocCntDict = new Dictionary <string, int>(); Dictionary <string, int> wordOccCntDict = new Dictionary <string, int>(); var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector ? BingNewsFields.FeatureVectorFieldWeights : BingNewsFields.NewsFieldWeights; ProgramProgress progress = new ProgramProgress(docNum); for (int iDoc = 0; iDoc < docNum; iDoc++) { var document = indexReader.Document(iDoc); var content = LuceneOperations.GetContent(document, fieldWeights); var words = NLPOperations.Tokenize(content, tokenConfig); var uniqueWords = new HashSet <string>(words); docWordCnt[iDoc] = words.Count; docUniqWordCnt[iDoc] = uniqueWords.Count; foreach (var word in uniqueWords) { if (!wordDocCntDict.ContainsKey(word)) { wordDocCntDict.Add(word, 0); } wordDocCntDict[word]++; } foreach (var word in words) { if (!wordOccCntDict.ContainsKey(word)) { wordOccCntDict.Add(word, 0); } wordOccCntDict[word]++; } progress.PrintIncrementExperiment(); } progress.PrintTotalTime(); indexReader.Close(); //Statistics DoubleStatistics statDocWordCnt = new DoubleStatistics(); DoubleStatistics statDocUniqWordCnt = new DoubleStatistics(); DoubleStatistics statWordDocCnt = new DoubleStatistics(); DoubleStatistics statWordOccCnt = new DoubleStatistics(); for (int iDoc = 0; iDoc < docNum; iDoc++) { statDocWordCnt.AddNumber(docWordCnt[iDoc]); statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]); } foreach (var kvp in wordDocCntDict) { statWordDocCnt.AddNumber(kvp.Value); } foreach (var kvp in wordOccCntDict) { statWordOccCnt.AddNumber(kvp.Value); } Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt")); Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt")); Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt")); Console.WriteLine(statWordOccCnt.ToString("wordOccCnt")); //Hist var docWordCntHist = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1); var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1); var wordDocCntHist = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000); var wordDocCntHist2 = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1); docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv"); docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv"); wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv"); wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv"); Console.Read(); }
public static void AnalyzeSearchWordSentiment(string indexPath, string field, string[] keywords,
    int printDocumentCnt = 10, string histogramField = null)
{
    var searcher = LuceneOperations.GetIndexSearcher(indexPath);
    var reader = searcher.GetIndexReader();
    var docIDs = LuceneOperations.Search(searcher, StringOperations.GetMergedString(keywords, " "), field);
    Console.WriteLine("Found {0}% ({1}/{2}) documents containing: {3}",
        (100.0 * docIDs.Count / reader.NumDocs()), docIDs.Count, reader.NumDocs(),
        StringOperations.GetMergedString(keywords, " "));

    var progress = new ProgramProgress(docIDs.Count);
    var sentiAnalyzer = new SentimentAnalyzer();
    SentimentType sentimentType;
    double sentimentScore;
    HeapSortDouble hsdPos = new HeapSortDouble(printDocumentCnt);
    HeapSortDouble hsdNeg = new HeapSortDouble(printDocumentCnt);
    Counter<string> counterPos = null;
    Counter<string> counterNeg = null;
    Counter<string> counterNeu = null;
    if (histogramField != null)
    {
        counterPos = new Counter<string>();
        counterNeg = new Counter<string>();
        counterNeu = new Counter<string>();
    }

    int posCnt = 0;
    int negCnt = 0;
    int neuCnt = 0;
    foreach (var docID in docIDs)
    {
        var document = reader.Document(docID);
        var content = document.Get(field);
        sentiAnalyzer.GetSentiment(content, out sentimentType, out sentimentScore);

        switch (sentimentType)
        {
            case SentimentType.Positive:
                posCnt++;
                hsdPos.Insert(docID, Math.Abs(sentimentScore));
                if (histogramField != null)
                {
                    counterPos.Add(document.Get(histogramField));
                }
                break;
            case SentimentType.Negative:
                negCnt++;
                hsdNeg.Insert(docID, Math.Abs(sentimentScore));
                if (histogramField != null)
                {
                    counterNeg.Add(document.Get(histogramField));
                }
                break;
            case SentimentType.Neutral:
                neuCnt++;
                if (histogramField != null)
                {
                    counterNeu.Add(document.Get(histogramField));
                }
                break;
            default:
                throw new NotImplementedException();
        }
        progress.PrintIncrementExperiment();
    }

    Console.WriteLine("Positive document ratio {0}% ({1}/{2})", Math.Round(100.0 * posCnt / docIDs.Count), posCnt, docIDs.Count);
    Console.WriteLine("Negative document ratio {0}% ({1}/{2})", Math.Round(100.0 * negCnt / docIDs.Count), negCnt, docIDs.Count);
    Console.WriteLine("Neutral document ratio {0}% ({1}/{2})", Math.Round(100.0 * neuCnt / docIDs.Count), neuCnt, docIDs.Count);

    Console.WriteLine(StringOperations.WrapWithDash("Positive documents"));
    foreach (var kvp in hsdPos.GetSortedDictionary())
    {
        Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
    }
    Console.WriteLine(StringOperations.WrapWithDash("Negative documents"));
    foreach (var kvp in hsdNeg.GetSortedDictionary())
    {
        Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
    }
    progress.PrintTotalTime();

    if (histogramField != null)
    {
        string[] featureStrings = new[] { "Pos", "Neg", "Neu" };
        Counter<string>[] counters = new[] { counterPos, counterNeg, counterNeu };
        for (int i = 0; i < featureStrings.Length; i++)
        {
            Console.WriteLine(StringOperations.WrapWithDash(histogramField + " " + featureStrings[i]));
            int index = 0;
            foreach (var kvp in counters[i].GetCountDictionary().OrderByDescending(kvp => kvp.Value))
            {
                Console.WriteLine(kvp.Key + "\t" + kvp.Value);
                if (++index >= 100)
                {
                    break;
                }
            }
        }
    }
    Console.ReadKey();
}
public void Transform(string inputFolder, string indexPath, HashSet<string> keywords)
{
    Console.WriteLine("Start searching words: " + StringOperations.GetMergedString(keywords));
    Console.WriteLine("InputFolder: " + inputFolder + "\n");

    string notParseSpecString = "Temp-DoNotParse";
    inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

    string[] schema = new[]
    {
        "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount",
        "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount"
    };
    var schemeDict = Util.GetInvertedDictionary(schema);
    var textFieldIndex = schemeDict["Text"];
    var createdTimeFieldIndex = schemeDict["CreatedAt"];
    var userIdFieldIndex = schemeDict["UserId"];

    if (Directory.Exists(indexPath))
    {
        Directory.Delete(indexPath, true);
    }
    var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

    // Preprocess: estimate the tweet count so progress can be reported against a total.
    Console.WriteLine("Start preprocessing...");
    ProgramProgress progress = new ProgramProgress(files.Length);
    int estiDocCnt = 0;
    foreach (var file in files)
    {
        estiDocCnt += FileOperations.GetLineCount(file);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    Console.WriteLine("Estimated tweet count: " + estiDocCnt + "\n");

    // Parse
    Console.WriteLine("Start parsing...");
    var indexWriter = LuceneOperations.GetIndexWriter(indexPath);
    TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
    progress = new ProgramProgress(estiDocCnt);
    var sep = new char[] { '\t' };
    int uniqDocFoundCnt = 0;
    int docFoundCnt = 0;
    int docCnt = 0;
    ThreeLayerHashSet<string, long, string> hash3Layer = new ThreeLayerHashSet<string, long, string>();
    int notUsedDocCnt = 0;
    foreach (var file in files)
    {
        if (file.Contains(notParseSpecString))
        {
            continue;
        }
        if (file.EndsWith(".txt"))
        {
            var sr = new StreamReader(file);
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                var tokens = line.Split(sep, StringSplitOptions.None);
                if (tokens.Length != schema.Length)
                {
                    notUsedDocCnt++;
                    continue;
                }
                var words = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                bool isContainSearch = false;
                foreach (var word in words)
                {
                    if (keywords.Contains(word))
                    {
                        isContainSearch = true;
                        break;
                    }
                }
                if (isContainSearch)
                {
                    string createdAt = tokens[createdTimeFieldIndex];
                    long userId = long.Parse(tokens[userIdFieldIndex]);
                    string text = tokens[textFieldIndex];
                    // Deduplicate on (createdAt, userId, text) before indexing.
                    if (!hash3Layer.Contains(createdAt, userId, text))
                    {
                        var document = new Document();
                        for (int i = 0; i < schema.Length; i++)
                        {
                            document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                        }
                        indexWriter.AddDocument(document);
                        hash3Layer.Add(createdAt, userId, text);
                        uniqDocFoundCnt++;
                    }
                    docFoundCnt++;
                }
                docCnt++;
                progress.PrintIncrementExperiment(string.Format(
                    "uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
                    uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt,
                    (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
            }
            sr.Close();
        }
    }
    progress.PrintTotalTime();

    Console.WriteLine(string.Format(
        "uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
        uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt,
        (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
    Console.WriteLine("Not used doc count: " + notUsedDocCnt);

    Console.WriteLine("Start writing index...");
    indexWriter.Commit();
    indexWriter.Close();
    Console.WriteLine("Finish");
    Console.ReadKey();
}
public void Start() { Initialize(); var reader = LuceneOperations.GetIndexReader(Configure.InputPath); InitializeWriters(); var docNum = reader.NumDocs(); var progress = new ProgramProgress(docNum); for (int iDoc = 0; iDoc < docNum; iDoc++) { var doc = reader.Document(iDoc); bool isSkip = false; //random sample if (!isSkip && Configure.IsSampling) { if (Random.NextDouble() > Configure.SampleRatio) { isSkip = true; } } //filter by time if (!isSkip && Configure.IsSelectByTime) { var dateTime = StringOperations.ParseDateTimeString( doc.Get(Configure.TimeField), Configure.ParseTimeFormat); if (dateTime.Subtract(StartDateTime).Ticks < 0 || dateTime.Subtract(EndDateTime).Ticks > 0) { isSkip = true; } } //filter by exact match if (!isSkip && Configure.IsSelectByExactMatch) { foreach (var kvp in Configure.FieldMatchDict) { if (doc.Get(kvp.Key) != kvp.Value) { isSkip = true; break; } } } if (!isSkip) { GetWriter(doc).AddDocument(doc); } progress.PrintIncrementExperiment(); } CloseWriters(); reader.Close(); }
public void TransformWithFileNames(string[] files, string indexPath, HashSet <string> searchHashSet, SearchSpinn3rType searchType) { double tweetCnt = 0; TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter); var indexWriter = LuceneOperations.GetIndexWriter(indexPath); var progress = new ProgramProgress(files.Length); int docFoundCount = 0; int totalDocCount = 0; foreach (var file in files) { FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) => { tweetCnt += data.count; //Console.WriteLine(data.count); //Console.WriteLine(data.items[0].main); foreach (var tweet in data.items) { if (tweet.lang != "en") { continue; } bool isContainSearch = false; switch (searchType) { case SearchSpinn3rType.Main: var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig); foreach (var word in words) { if (searchHashSet.Contains(word)) { isContainSearch = true; break; } } break; case SearchSpinn3rType.User: isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower()); break; default: throw new ArgumentException(); } if (isContainSearch) { var document = new Document(); document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED)); document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED)); indexWriter.AddDocument(document); docFoundCount++; } totalDocCount++; } }); progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount)); } progress.PrintTotalTime(); Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount); Console.WriteLine("Start writing index..."); indexWriter.Commit(); indexWriter.Close(); Util.ProgramFinishHalt(); }
private List <int> RemoveSimilarDocumentsGranu(IndexReader reader, List <int> orgDocIDs, int timeWindowSize, int wordWindowSize) { var newDocIDs = new List <int>(); var removeSimilarity = Configure.MinDistinctiveDocumentCosine; Dictionary <int, Dictionary <int, List <SparseVectorList> > > uniqueDocHash = new Dictionary <int, Dictionary <int, List <SparseVectorList> > >(); int docNum = orgDocIDs.Count; int removeDocNum = 0; Dictionary <string, int> lexicon = new Dictionary <string, int>(); int timeslicesize = 1; if (timeWindowSize >= 15) { int[] dividePieceNumbers = new int[] { 3, 4, 5, 7 }; foreach (int dividePieceNumber in dividePieceNumbers) { if (timeWindowSize % dividePieceNumber == 0) { timeslicesize = timeWindowSize / dividePieceNumber; break; } } if (timeslicesize == 1) { timeslicesize = (timeWindowSize + 2) / 3; timeWindowSize = 3; } else { timeWindowSize /= timeslicesize; } Console.WriteLine("Reset window size! TimeSliceSize: {0}, WindowSize: {1}", timeslicesize, timeWindowSize); } int begintimedelta = -(timeWindowSize - 1) / 2; int endtimedelta = timeWindowSize / 2; var progress = new ProgramProgress(docNum); StreamWriter debugSw = null; if (Configure.IsDebug) { string fileName = Configure.OutputPath + "debug.txt"; FileOperations.EnsureFileFolderExist(fileName); debugSw = new StreamWriter(fileName, true, Encoding.UTF8); } foreach (var iDoc in orgDocIDs) { var doc = reader.Document(iDoc); SparseVectorList vector = GetFeatureVector(doc, lexicon); if (vector == null) { removeDocNum++; continue; } vector.documentid = iDoc; int time = getDateTimeBingNews(doc) / timeslicesize; int[] words = getMostFreqWordIndex(vector, wordWindowSize); bool bunqiue = true; for (int stime = time + begintimedelta; stime <= time + endtimedelta; stime++) { if (uniqueDocHash.ContainsKey(stime)) { Dictionary <int, List <SparseVectorList> > wordHash = uniqueDocHash[stime]; foreach (int sword in words) { if (wordHash.ContainsKey(sword)) { List <SparseVectorList> vectorList = wordHash[sword]; foreach (SparseVectorList svector in vectorList) { if (SparseVectorList.Cosine(svector, vector) >= removeSimilarity) { if (Configure.IsDebug && removeDocNum <= 10000) { double simi = SparseVectorList.Cosine(svector, vector); if (simi <= Configure.MaxShowDebugCosine) { debugSw.WriteLine("---------------------------------------------------"); debugSw.WriteLine(reader.Document(svector.documentid).Get(BingNewsFields.NewsArticleHeadline)); //Get("NewsArticleDescription")); debugSw.WriteLine(reader.Document(vector.documentid).Get(BingNewsFields.NewsArticleHeadline)); //Get("NewsArticleDescription")); debugSw.WriteLine(""); string body1 = reader.Document(svector.documentid).Get(BingNewsFields.NewsArticleDescription); string body2 = reader.Document(vector.documentid).Get(BingNewsFields.NewsArticleDescription); if (body1.Length > 100) { body1 = body1.Substring(0, 100); } if (body2.Length > 100) { body2 = body2.Substring(0, 100); } debugSw.WriteLine(body1); debugSw.WriteLine(body2); debugSw.WriteLine(simi); } debugSw.Flush(); } bunqiue = false; break; } } } if (!bunqiue) { break; } } } if (!bunqiue) { break; } } if (bunqiue) { int keytime = time; int keyword = words[0]; if (!uniqueDocHash.ContainsKey(keytime)) { uniqueDocHash.Add(keytime, new Dictionary <int, List <SparseVectorList> >()); } Dictionary <int, List <SparseVectorList> > wordHash = uniqueDocHash[keytime]; if (!wordHash.ContainsKey(keyword)) { wordHash.Add(keyword, new List <SparseVectorList>()); } List <SparseVectorList> list = wordHash[keyword]; list.Add(vector); 
newDocIDs.Add(iDoc); } else { removeDocNum++; } progress.PrintIncrementExperiment(); } Console.WriteLine("Finished remove similar documents. Removed {0} out of {1}", removeDocNum, docNum); int listLengthSum = 0, listCnt = 0; foreach (Dictionary <int, List <SparseVectorList> > hash0 in uniqueDocHash.Values) { foreach (List <SparseVectorList> list in hash0.Values) { listLengthSum += list.Count; listCnt++; } } Console.WriteLine("AvgListLength: {0}, ListCnt: {1}", listLengthSum / listCnt, listCnt); if (Configure.IsDebug) { debugSw.Flush(); debugSw.Close(); } return(newDocIDs); }
public void Start() { string inputPath = @"D:\DataProcess\TweetIndex\tweets-Ebola-20150101-20150228_dedup\"; string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter2\"; var indexReader = LuceneOperations.GetIndexReader(inputPath); var indexWriter = LuceneOperations.GetIndexWriter(outputPath); char[] seperator = new char[] { ' ' }; string[] aidFields = new string[] { "User_FollowersCount", "User_Name", "User_ScreenName", "Retweet", "Mention" }; ProgramProgress progress = new ProgramProgress(indexReader.NumDocs()); //for (int iDoc = 0; iDoc < 1000; iDoc++) for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++) { Document inDoc = indexReader.Document(iDoc); Document outDoc = new Document(); string inTime = inDoc.Get("CreateAt"); DateTime dateTime = DateTime.Parse(inTime); outDoc.Add(new Field(BingNewsFields.DiscoveryStringTime, dateTime.ToString(BingNewsFields.TimeFormat), Field.Store.YES, Field.Index.ANALYZED)); string hashtag = inDoc.Get("Hashtag"); string word = inDoc.Get("Word"); if (hashtag == null) { hashtag = ""; } var hashtagTokens = hashtag.Split(seperator, StringSplitOptions.RemoveEmptyEntries); var wordTokens = word.Split(seperator, StringSplitOptions.RemoveEmptyEntries); string title = hashtagTokens.Length > 0 ? hashtagTokens[0] : wordTokens.Length > 0 ? wordTokens[0] : ""; outDoc.Add(new Field(BingNewsFields.NewsArticleHeadline, title, Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.NewsArticleDescription, inDoc.Get("Text"), Field.Store.YES, Field.Index.ANALYZED)); string featureVector = ""; Counter <string> counter = new Counter <string>(); foreach (var tag in hashtagTokens) { counter.Add(tag); counter.Add(tag); } foreach (var w in wordTokens) { counter.Add(w); } foreach (var kvp in counter.GetSortedCountDictioanry()) { featureVector += string.Format("{0}({1})\\n", kvp.Key, kvp.Value); } outDoc.Add(new Field(BingNewsFields.FeatureVector, featureVector, Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.DocId, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.DocumentURL, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED)); foreach (var aidField in aidFields) { var value = inDoc.Get(aidField); outDoc.Add(new Field(aidField, value == null ? "" : value, Field.Store.YES, Field.Index.ANALYZED)); } indexWriter.AddDocument(outDoc); progress.PrintIncrementExperiment(); } indexWriter.Optimize(); indexWriter.Close(); indexReader.Close(); }
//public void StartEbola(int[] focusSeeds) public void StartEbola() { // -- node counts -- string folder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\Trees3\"; string exeFolder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\RoseRiver\bin\x64\Release\"; if (!Directory.Exists(folder)) { folder = @"H:\Xiting\StreamingRoseRiver\ScalabilityExperiment\Data\Trees3\"; exeFolder = @"H:\Xiting\StreamingRoseRiver\ScalabilityExperiment\RoseRiverExe\"; } if (!Directory.Exists(folder)) { folder = @"D:\Documents\roseriver\RoseRiver\RoseRiver\Data\Ebola\Trees3\"; exeFolder = @"D:\Documents\roseriver\RoseRiver\RoseRiver\Data\Ebola\ScalabilityExperiment\RoseRiver\RoseRiver\bin\x64\Release\"; } List <int> nodeCounts = new List <int>(); for (int i = 0; i < 30; i++) { var fileName = folder + i + ".gv"; var tree = BRTAnalysis.ReadTree(fileName); nodeCounts.Add(tree.BFS(tree.Root).Count()); } // -- experiment -- var copyFactors = new[] { 1 }; //Util.GetIntArray(1, 9, 2); //new[] {1, 2, 5, 10, 20, 50}; var focusCounts = new[] { 1, 3, 5 }; //DataProcess.Utils.Util.GetIntArray(1, 5); //var focusSampleCount = 1;//50; var focusSeeds = Util.GetIntArray(51, 100); //Util.GetIntArray(1, 50); //new[] { 1 };//Util.GetIntArray(1, 50); //var minMaxTreeCount = 10; //var maxMaxTreeCount = 30; var treeCounts = Util.GetIntArray(5, 30); //new int[] { 5, 10 };//new[] {10, 20}; int index = 0; ProgramProgress progress = new ProgramProgress(copyFactors.Length * focusCounts.Length * focusSeeds.Length * treeCounts.Length); var configure = new TopicStreamConfigure(); foreach (int focusSeed in focusSeeds) { foreach (var copyFactor in copyFactors) { configure.CopyFactor = copyFactor; foreach (var focusCount in focusCounts) { configure.FocusCount = focusCount; configure.DefaultTreeCut = GetRandomManuallyTreeCut(focusCount, treeCounts.Min(), focusSeed, nodeCounts, 1); configure.DefaultTreeCutRandomSeed = focusSeed; foreach (var treeCount in treeCounts) { if (File.Exists("RunTimeExperiment\\" + index + ".txt")) { Console.WriteLine("Skip index = " + index); index++; progress.PrintSkipExperiment(); continue; } configure.TreeCount = treeCount; configure.Index = index; configure.Write(); File.Copy(TopicStreamConfigure.ConfigureFileName, exeFolder + TopicStreamConfigure.ConfigureFileName, true); ProcessStartInfo startInfo = new ProcessStartInfo(); startInfo.ErrorDialog = false; startInfo.CreateNoWindow = false; startInfo.UseShellExecute = false; startInfo.FileName = exeFolder + @"RoseRiver.exe"; startInfo.WindowStyle = ProcessWindowStyle.Normal; using (Process exeProcess = Process.Start(startInfo)) { exeProcess.WaitForExit(); } progress.PrintIncrementExperiment("\n"); index++; } } } } progress.PrintTotalTime(); }
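// Optional sketch for the two experiment drivers above (StartKDD and StartEbola): capture the
// child RoseRiver.exe console output into a per-run log file instead of showing or hiding a
// console window. Standard System.Diagnostics usage; the helper name and log path are hypothetical.
private static void RunAndLog(string exePath, string logPath)
{
    var startInfo = new ProcessStartInfo
    {
        FileName = exePath,
        UseShellExecute = false,          // required for output redirection
        RedirectStandardOutput = true,    // only stdout is redirected here, to avoid buffer deadlocks
        CreateNoWindow = true
    };
    using (Process exeProcess = Process.Start(startInfo))
    using (var log = new StreamWriter(logPath))
    {
        log.Write(exeProcess.StandardOutput.ReadToEnd());
        exeProcess.WaitForExit();
    }
}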