public static void VisualizeTree(IEnumerable<string> brtFiles, string luceneIndex = null,
    string[] keywords = null, bool isRemoveLeafNodes = true)
{
    List<ITree> trees = new List<ITree>();
    foreach (var brtFile in brtFiles)
    {
        //Read tree from file
        TreeDataParser parser = new TreeDataParser(brtFile, isRemoveLeafNodes);
        var tree = parser.GetTree();
        Trace.WriteLine(tree.GetDepth(tree.Root));
        if (luceneIndex != null)
        {
            var scheme = TreeNodeScheme.Get(tree.Graph.NodeTable);
            scheme.SetIndexReader(LuceneOperations.GetIndexReader(luceneIndex));
            scheme.SetBRTFileName(brtFile);
        }
        trees.Add(tree);
    }

    //Print analysis info
    DoubleStatistics depthStat = new DoubleStatistics();
    DoubleStatistics internalNodeStat = new DoubleStatistics();
    foreach (var tree in trees)
    {
        //Node depth = number of ancestors; tree depth = max node depth + 1
        depthStat.AddNumber(tree.BFS(tree.Root).Max(node =>
        {
            int depth = 0;
            INode ancestor = node;
            while ((ancestor = tree.GetParent(ancestor)) != null)
            {
                depth++;
            }
            return depth;
        }) + 1);
        internalNodeStat.AddNumber(tree.BFS(tree.Root).Count());
    }
    Console.WriteLine(depthStat.ToString());
    Console.WriteLine(internalNodeStat.ToString());

    //Visualize tree: the visualization must run on an STA thread, and the
    //dispatcher loop below blocks until it is shut down.
    Thread netServer = new Thread(new ThreadStart(() =>
    {
        TreeVisualization treeVis = new TreeVisualization(trees, keywords);
    }));
    netServer.SetApartmentState(ApartmentState.STA);
    netServer.IsBackground = true;
    netServer.Start();
    System.Windows.Threading.Dispatcher.Run();
}
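// Example usage (a hedged sketch: the paths and keywords below are hypothetical,
// not taken from this code base). VisualizeTree blocks on the WPF dispatcher
// loop until the visualization is closed.
//
//   var brtFiles = Directory.GetFiles(@"D:\Data\Trees", "*.brt");
//   VisualizeTree(brtFiles, luceneIndex: @"D:\Data\LuceneIndex", keywords: new[] { "ebola" });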
public static void AnalyzeTreeStructure()
{
    //string folder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\Data\KddInfovisGraphicsIndex_Lucene_a=0.003_sm=1-test\";
    string folder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\Trees3\";
    int index = 0;
    DoubleStatistics stat = new DoubleStatistics();
    //Read consecutively numbered files (0.gv, 1.gv, ...) until one is missing
    while (true)
    {
        var fileName = folder + index + ".gv";
        if (!File.Exists(fileName))
        {
            break;
        }
        var tree = ReadTree(fileName);
        stat.AddNumber(tree.Graph.NodeCount);
        Console.WriteLine(tree.Graph.NodeCount);
        //Console.WriteLine(index + "\t" + stat.ToStringShort());
        //Console.Write(stat.GetAverage() + "\t");
        index++;
    }
}
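// ReadTree is referenced above but not defined in this section. A minimal sketch
// of what it is assumed to look like, reusing TreeDataParser from VisualizeTree
// (the signature and the isRemoveLeafNodes value are assumptions):
//
//   private static ITree ReadTree(string fileName)
//   {
//       var parser = new TreeDataParser(fileName, false);
//       return parser.GetTree();
//   }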
public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
{
    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var docNum = indexReader.NumDocs();
    int[] docWordCnt = new int[docNum];
    int[] docUniqWordCnt = new int[docNum];
    Dictionary<string, int> wordDocCntDict = new Dictionary<string, int>();
    Dictionary<string, int> wordOccCntDict = new Dictionary<string, int>();

    var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
        ? BingNewsFields.FeatureVectorFieldWeights
        : BingNewsFields.NewsFieldWeights;

    ProgramProgress progress = new ProgramProgress(docNum);
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        var document = indexReader.Document(iDoc);
        var content = LuceneOperations.GetContent(document, fieldWeights);
        var words = NLPOperations.Tokenize(content, tokenConfig);
        var uniqueWords = new HashSet<string>(words);
        docWordCnt[iDoc] = words.Count;
        docUniqWordCnt[iDoc] = uniqueWords.Count;

        //Document frequency: count each word at most once per document
        foreach (var word in uniqueWords)
        {
            if (!wordDocCntDict.ContainsKey(word))
            {
                wordDocCntDict.Add(word, 0);
            }
            wordDocCntDict[word]++;
        }

        //Occurrence frequency: count every occurrence
        foreach (var word in words)
        {
            if (!wordOccCntDict.ContainsKey(word))
            {
                wordOccCntDict.Add(word, 0);
            }
            wordOccCntDict[word]++;
        }

        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();
    indexReader.Close();

    //Statistics
    DoubleStatistics statDocWordCnt = new DoubleStatistics();
    DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
    DoubleStatistics statWordDocCnt = new DoubleStatistics();
    DoubleStatistics statWordOccCnt = new DoubleStatistics();
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        statDocWordCnt.AddNumber(docWordCnt[iDoc]);
        statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
    }
    foreach (var kvp in wordDocCntDict)
    {
        statWordDocCnt.AddNumber(kvp.Value);
    }
    foreach (var kvp in wordOccCntDict)
    {
        statWordOccCnt.AddNumber(kvp.Value);
    }

    Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
    Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
    Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
    Console.WriteLine(statWordOccCnt.ToString("statWordOccCnt"));

    //Histograms
    var docWordCntHist = new DoubleHistogram(docWordCnt.Select(i => (double)i), 1.0);
    var docUniqWordCntHist = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), 1.0);
    var wordDocCntHist = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
    var wordDocCntHist2 = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1.0);

    docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
    docUniqWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
    wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
    wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");

    Console.Read();
}
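// Example usage (hypothetical index path and stop-word setting; the TokenizeConfig
// constructor call mirrors the one used in GetLanguageErrorDocuments below):
//
//   AnalyzeTwitterWordDistribution(
//       @"D:\Data\TwitterIndex",
//       new TokenizeConfig(TokenizerType.Standard, StopWordsFile.EN));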
public void AnalyzeResultsTreeCounts()
{
    bool isAddVisTime = true;
    var folders = new[]
    {
        @"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-TreeCountExp2\",
        //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-supp\",
        //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-TreeCount\",
        //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-TreeCountStart19\",
        //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-treeCount21End\",
    };

    //Key: (copyFactor, focusCount, treeCount); value: focusSeed -> total run time
    Dictionary<Tuple<int, int, int>, Dictionary<int, double>> statDictionary =
        new Dictionary<Tuple<int, int, int>, Dictionary<int, double>>();
    HashSet<int> copyFactors = new HashSet<int>();
    HashSet<int> focusCounts = new HashSet<int>();
    HashSet<int> treeCounts = new HashSet<int>();
    HashSet<int> focusSeeds = new HashSet<int>();
    foreach (var folder in folders)
    {
        foreach (var file in Directory.GetFiles(folder))
        {
            var result = new TopicStreamResult(file);
            var key = Tuple.Create(result.CopyFactor, result.FocusCount, result.TreeCount);
            Dictionary<int, double> value;
            if (!statDictionary.TryGetValue(key, out value))
            {
                value = new Dictionary<int, double>();
                statDictionary.Add(key, value);
            }
            value.Add(result.DefaultTreeCutRandomSeed,
                result.TreeCutTime + (isAddVisTime ? result.VisTime : 0.0) +
                result.DAGTime + result.SedimentationTime);
            focusSeeds.Add(result.DefaultTreeCutRandomSeed);
            copyFactors.Add(result.CopyFactor);
            focusCounts.Add(result.FocusCount);
            treeCounts.Add(result.TreeCount);
        }
    }

    if (copyFactors.Count != 1)
    {
        throw new ArgumentException("Expected results for exactly one copy factor.");
    }

    var treeCountList = new List<int>(treeCounts);
    treeCountList.Sort();
    var focusCountList = new List<int>(focusCounts);
    focusCountList.Sort();
    var focusSeedList = new List<int>(focusSeeds);
    focusSeedList.Sort();
    var singleCopyFactor = copyFactors.First();

    //matrix[i, j]: average marginal cost of the (i + 1)-th tree-count step at the j-th focus count
    double[,] matrix = new double[treeCountList.Count - 1, focusCountList.Count];
    int treeCountIndex = 0;
    int minTreeCnt = treeCountList.First();
    //Assumes the tree counts are uniformly spaced
    var deltaTreeCount = treeCountList[1] - treeCountList[0];
    foreach (var treeCount in treeCountList)
    {
        if (treeCount == minTreeCnt)
        {
            continue;
        }
        int focusCountIndex = 0;
        foreach (var focusCount in focusCountList)
        {
            var tuple1 = Tuple.Create(singleCopyFactor, focusCount, treeCount - deltaTreeCount);
            var tuple2 = Tuple.Create(singleCopyFactor, focusCount, treeCount);
            DoubleStatistics stat = new DoubleStatistics();
            var dict1 = statDictionary[tuple1];
            var dict2 = statDictionary[tuple2];
            //if (dict1.Count != 50 || dict2.Count != 50)
            //{
            //    throw new ArgumentException();
            //}
            foreach (var focusSeed in focusSeedList)
            {
                double time1, time2;
                if (dict1.TryGetValue(focusSeed, out time1) && dict2.TryGetValue(focusSeed, out time2))
                {
                    stat.AddNumber(Math.Max(time2 - time1, 0));
                }
            }
            matrix[treeCountIndex, focusCountIndex] = stat.GetAverage();
            focusCountIndex++;
        }
        treeCountIndex++;
    }

    //Print as a MATLAB-style matrix literal
    Console.Write("[");
    for (int j = 0; j < treeCountList.Count - 1; j++)
    {
        for (int k = 0; k < focusCountList.Count; k++)
        {
            Console.Write(matrix[j, k] + (k == focusCountList.Count - 1 ? ";" : ","));
        }
        if (j == treeCountList.Count - 2) //close the matrix on the last printed row
        {
            Console.WriteLine("]/1000;");
        }
        else
        {
            Console.WriteLine("...");
        }
    }
    Console.WriteLine();
}
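// The loop above emits a MATLAB-style matrix literal; with three tree counts and
// two focus counts the output would look like (values illustrative only):
//
//   [12.3,14.1;...
//   13.0,15.2;]/1000;
//
// Row j is the average marginal cost of raising the tree count from
// treeCountList[j] to treeCountList[j + 1]; columns follow focusCountList. The
// trailing "/1000" suggests the recorded times are in milliseconds and get
// rescaled to seconds when the matrix is evaluated in MATLAB (an assumption).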
List<int> GetLanguageErrorDocuments(IndexReader indexreader, string outputfile)
{
    Console.WriteLine("==========Remove language error documents!==========");
    StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
    List<int> removedDocuments = new List<int>();

    var stopWords = IsEnglish
        ? FileOperations.LoadKeyWordFile(StopWordsFile.EN, true)
        : FileOperations.LoadKeyWordFile(StopWordsFile.CH, false);
    var stopHash = Util.GetHashSet(stopWords);

    int docNum = indexreader.NumDocs();
    string titlefield = this.TitleField;
    string bodyfield = this.BodyField;
    int removedDocNum = 0;
    Console.WriteLine("Total documents: {0}", docNum);

    var tokenConfig = new TokenizeConfig(
        IsEnglish ? TokenizerType.Standard : TokenizerType.ICTCLAS, StopWordsFile.NO);
    DoubleStatistics statPercent = new DoubleStatistics();
    DoubleStatistics statAbsolute = new DoubleStatistics();

    for (int idoc = 0; idoc < docNum; idoc++)
    {
        //Progress report every 10000 documents; idoc == 0 is excluded so the
        //percentage below never divides by zero
        if (idoc % 10000 == 0 && idoc != 0)
        {
            Console.WriteLine("Process " + idoc + "th document!");
            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, idoc, 100.0 * removedDocNum / idoc);
            if (IsPrintTextFiles)
            {
                sw.Flush();
            }
        }

        Document document = indexreader.Document(idoc);
        string content = document.Get(titlefield) + " " + document.Get(bodyfield);
        if (IsEnglish)
        {
            content = content.ToLower();
        }

        var words = NLPOperations.Tokenize(content, tokenConfig);
        var termCnt0 = words.Count; //all tokens
        var termCnt1 = 0;           //non-stop-word tokens
        foreach (var word in words)
        {
            if (!stopHash.Contains(word))
            {
                termCnt1++;
            }
        }

        if (termCnt0 == 0)
        {
            continue; //empty document: no evidence either way, keep it and skip the statistics
        }

        //A stop-word ratio below the threshold suggests the document is not in the target language
        if (((double)termCnt0 - termCnt1) / termCnt0 < MinLanguageCorrectRatio)
        {
            if (IsPrintTextFiles)
            {
                sw.WriteLine(DocumentToString(document));
            }
            removedDocuments.Add(idoc);
            removedDocNum++;
        }
        else
        {
            statAbsolute.AddNumber(termCnt0 - termCnt1);
            statPercent.AddNumber(100.0 * (termCnt0 - termCnt1) / termCnt0);
        }
    }

    Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100.0 * removedDocNum / docNum);
    if (IsPrintTextFiles)
    {
        sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100.0 * removedDocNum / docNum);
        sw.Flush();
        sw.Close();
    }
    Console.WriteLine(statPercent.ToString("stat_percent"));
    Console.WriteLine(statAbsolute.ToString("stat_absolute"));
    return removedDocuments;
}
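// Example usage (hypothetical paths; assumes IsEnglish, TitleField, BodyField,
// IsPrintTextFiles and MinLanguageCorrectRatio are already configured on this instance):
//
//   var reader = LuceneOperations.GetIndexReader(@"D:\Data\NewsIndex");
//   var errorDocIds = GetLanguageErrorDocuments(reader, @"D:\Data\languageErrors.txt");
//   reader.Close();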