Example #1
        public static void VisualizeTree(IEnumerable<string> brtFiles, string luceneIndex = null, string[] keywords = null, bool isRemoveLeafNodes = true)
        {
            List<ITree> trees = new List<ITree>();

            foreach (var brtFile in brtFiles)
            {
                //Read tree from file
                TreeDataParser parser = new TreeDataParser(brtFile, isRemoveLeafNodes);
                var            tree   = parser.GetTree();
                Trace.WriteLine(tree.GetDepth(tree.Root));
                if (luceneIndex != null)
                {
                    var scheme = TreeNodeScheme.Get(tree.Graph.NodeTable);
                    scheme.SetIndexReader(LuceneOperations.GetIndexReader(luceneIndex));
                    scheme.SetBRTFileName(brtFile);
                }
                trees.Add(tree);
            }

            //Print analysis info (tree depth and internal node counts)
            DoubleStatistics depthStat        = new DoubleStatistics();
            DoubleStatistics internalNodeStat = new DoubleStatistics();

            foreach (var tree in trees)
            {
                //Tree depth = longest chain of parent links from any node to the root, plus one for the root level
                depthStat.AddNumber(tree.BFS(tree.Root).Max(node =>
                {
                    int depth      = 0;
                    INode ancestor = node;
                    while ((ancestor = tree.GetParent(ancestor)) != null)
                    {
                        depth++;
                    }
                    return depth;
                }) + 1);
                internalNodeStat.AddNumber(tree.BFS(tree.Root).Count());
            }
            Console.WriteLine(depthStat.ToString());
            Console.WriteLine(internalNodeStat.ToString());

            //Visualize tree on a background STA thread (WPF controls require STA)
            Thread netServer = new Thread(new ThreadStart(() =>
            {
                TreeVisualization treeVis = new TreeVisualization(trees, keywords);
            }));

            netServer.SetApartmentState(ApartmentState.STA);
            netServer.IsBackground = true;
            netServer.Start();
            System.Windows.Threading.Dispatcher.Run();
        }
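All of the examples on this page feed a DoubleStatistics accumulator through AddNumber and read summaries back via GetAverage, ToString(), or a labeled ToString(label). The class itself is not part of this listing, so the sketch below is only an inferred stand-in with the same surface; its internals (which moments it tracks, how it formats output) are assumptions.

        using System;

        //Hypothetical stand-in for DoubleStatistics, inferred from the call
        //sites in these examples; the real class may track more than this.
        public class DoubleStatisticsSketch
        {
            private int    _count;
            private double _sum;
            private double _sumOfSquares;
            private double _min = double.MaxValue;
            private double _max = double.MinValue;

            //Feed one observation into the running statistics
            public void AddNumber(double x)
            {
                _count++;
                _sum          += x;
                _sumOfSquares += x * x;
                if (x < _min) _min = x;
                if (x > _max) _max = x;
            }

            public double GetAverage()
            {
                return _count == 0 ? 0.0 : _sum / _count;
            }

            //Labeled summary, mirroring calls like stat.ToString("stat_percent")
            public string ToString(string label)
            {
                double avg      = GetAverage();
                double variance = _count == 0 ? 0.0 : Math.Max(0.0, _sumOfSquares / _count - avg * avg);
                return string.Format("{0}: count={1}, avg={2:F4}, std={3:F4}, min={4}, max={5}",
                                     label, _count, avg, Math.Sqrt(variance), _min, _max);
            }

            public override string ToString()
            {
                return ToString("stat");
            }
        }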
Example #2
        public static void AnalyzeTreeStructure()
        {
            //string folder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\Data\KddInfovisGraphicsIndex_Lucene_a=0.003_sm=1-test\";
            string           folder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\Trees3\";
            int              index  = 0;
            DoubleStatistics stat   = new DoubleStatistics();

            while (true)
            {
                var fileName = folder + index + ".gv";
                if (!File.Exists(fileName))
                {
                    break;
                }
                var tree = ReadTree(fileName);
                stat.AddNumber(tree.Graph.NodeCount);
                Console.WriteLine(tree.Graph.NodeCount);
                //Console.WriteLine(index + "\t" + stat.ToStringShort());
                //Console.Write(stat.GetAverage() + "\t");
                index++;
            }
        }
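AnalyzeTreeStructure walks consecutively numbered .gv files (0.gv, 1.gv, ...) and stops at the first missing index. The same traversal can be factored into an iterator; EnumerateNumberedFiles below is a hypothetical helper assuming that naming scheme, not part of the original code.

        using System.Collections.Generic;
        using System.IO;

        //Yield folder\0.gv, folder\1.gv, ... until the first missing file
        static IEnumerable<string> EnumerateNumberedFiles(string folder, string extension)
        {
            for (int index = 0; ; index++)
            {
                var fileName = Path.Combine(folder, index + extension);
                if (!File.Exists(fileName))
                {
                    yield break;
                }
                yield return fileName;
            }
        }

With it, the while (true) loop reduces to foreach (var fileName in EnumerateNumberedFiles(folder, ".gv")) { ... }.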
Example #3
        public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
        {
            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var docNum      = indexReader.NumDocs();

            int[] docWordCnt     = new int[docNum];
            int[] docUniqWordCnt = new int[docNum];
            Dictionary<string, int> wordDocCntDict = new Dictionary<string, int>();
            Dictionary<string, int> wordOccCntDict = new Dictionary<string, int>();

            var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
                ? BingNewsFields.FeatureVectorFieldWeights
                : BingNewsFields.NewsFieldWeights;

            ProgramProgress progress = new ProgramProgress(docNum);

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var document = indexReader.Document(iDoc);
                var content  = LuceneOperations.GetContent(document, fieldWeights);

                var words       = NLPOperations.Tokenize(content, tokenConfig);
                var uniqueWords = new HashSet<string>(words);
                docWordCnt[iDoc]     = words.Count;
                docUniqWordCnt[iDoc] = uniqueWords.Count;

                foreach (var word in uniqueWords)
                {
                    if (!wordDocCntDict.ContainsKey(word))
                    {
                        wordDocCntDict.Add(word, 0);
                    }
                    wordDocCntDict[word]++;
                }

                foreach (var word in words)
                {
                    if (!wordOccCntDict.ContainsKey(word))
                    {
                        wordOccCntDict.Add(word, 0);
                    }
                    wordOccCntDict[word]++;
                }

                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            indexReader.Close();

            //Statistics
            DoubleStatistics statDocWordCnt     = new DoubleStatistics();
            DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
            DoubleStatistics statWordDocCnt     = new DoubleStatistics();
            DoubleStatistics statWordOccCnt     = new DoubleStatistics();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                statDocWordCnt.AddNumber(docWordCnt[iDoc]);
                statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
            }

            foreach (var kvp in wordDocCntDict)
            {
                statWordDocCnt.AddNumber(kvp.Value);
            }

            foreach (var kvp in wordOccCntDict)
            {
                statWordOccCnt.AddNumber(kvp.Value);
            }


            Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
            Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
            Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
            Console.WriteLine(statWordOccCnt.ToString("statWordOccCnt"));

            //Hist
            var docWordCntHist     = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1);
            var docUniqWordCntHist = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1);
            var wordDocCntHist     = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
            var wordDocCntHist2    = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1);

            docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
            docUniqWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
            wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
            wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");

            Console.Read();
        }
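The DoubleHistogram constructor above is called with both 1000 (an int) and (double)1, which suggests separate bin-count and bin-width overloads; the class itself is not shown in this listing. A minimal bin-width version, with all internals assumed:

        using System;
        using System.Collections.Generic;
        using System.IO;

        //Hypothetical fixed-bin-width histogram with a PrintToFile-style CSV dump
        class DoubleHistogramSketch
        {
            private readonly SortedDictionary<int, int> _bins = new SortedDictionary<int, int>();
            private readonly double _binWidth;

            public DoubleHistogramSketch(IEnumerable<double> values, double binWidth)
            {
                _binWidth = binWidth;
                foreach (var value in values)
                {
                    int bin = (int)Math.Floor(value / binWidth);
                    int count;
                    _bins.TryGetValue(bin, out count);
                    _bins[bin] = count + 1;
                }
            }

            public void PrintToFile(string path)
            {
                using (var writer = new StreamWriter(path))
                {
                    writer.WriteLine("binStart,count");
                    foreach (var kvp in _bins)
                    {
                        writer.WriteLine(kvp.Key * _binWidth + "," + kvp.Value);
                    }
                }
            }
        }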
Example #4
        public void AnalyzeResultsTreeCounts()
        {
            bool isAddVisTime = true;
            var  folders      = new[]
            {
                @"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-TreeCountExp2\",
                //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-supp\",
                //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-TreeCount\",
                //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-TreeCountStart19\",
                //@"D:\Project\StreamingRoseRiver\ScalabilityExperiment\Results\RunTimeExperiment-treeCount21End\",
            };
            //Dictionary<int, Dictionary<int, DoubleStatistics[]>> statDictionary = new Dictionary<int, Dictionary<int, DoubleStatistics[]>>();
            Dictionary<Tuple<int, int, int>, Dictionary<int, double>> statDictionary =
                new Dictionary<Tuple<int, int, int>, Dictionary<int, double>>();

            HashSet<int> copyFactors = new HashSet<int>();
            HashSet<int> focusCounts = new HashSet<int>();
            HashSet<int> treeCounts  = new HashSet<int>();
            HashSet<int> focusSeeds  = new HashSet<int>();

            foreach (var folder in folders)
            {
                foreach (var file in Directory.GetFiles(folder))
                {
                    var result = new TopicStreamResult(file);

                    var key = Tuple.Create(result.CopyFactor, result.FocusCount, result.TreeCount);
                    Dictionary<int, double> value;
                    if (!statDictionary.TryGetValue(key, out value))
                    {
                        value = new Dictionary<int, double>();
                        statDictionary.Add(key, value);
                    }
                    value.Add(result.DefaultTreeCutRandomSeed,
                              (result.TreeCutTime + (isAddVisTime ? result.VisTime : 0.0) + result.DAGTime +
                               result.SedimentationTime));
                    focusSeeds.Add(result.DefaultTreeCutRandomSeed);

                    copyFactors.Add(result.CopyFactor);
                    focusCounts.Add(result.FocusCount);
                    treeCounts.Add(result.TreeCount);
                }
            }

            if (copyFactors.Count != 1)
            {
                throw new ArgumentException("Expected exactly one copy factor across all result files.");
            }
            var treeCountList = new List<int>(treeCounts);

            treeCountList.Sort();
            var focusCountList = new List<int>(focusCounts);

            focusCountList.Sort();
            var focusSeedList = new List<int>(focusSeeds);

            focusSeedList.Sort();
            var singleCopyFactor = copyFactors.First();

            double[,] matrices = new double[treeCountList.Count - 1, focusCountList.Count];
            int treeCountIndex = 0;      //row index: one row per consecutive tree-count pair
            int minTreeCnt     = treeCountList.First();
            var deltaTreeCount = treeCountList[1] - treeCountList[0];

            foreach (var treeCount in treeCountList)
            {
                if (treeCount == minTreeCnt)
                {
                    continue;
                }
                int focusCountIndex = 0;
                foreach (var focusCount in focusCountList)
                {
                    var tuple1 = Tuple.Create(singleCopyFactor, focusCount, treeCount - deltaTreeCount);
                    var tuple2 = Tuple.Create(singleCopyFactor, focusCount, treeCount);

                    DoubleStatistics stat = new DoubleStatistics();
                    var dict1             = statDictionary[tuple1];
                    var dict2             = statDictionary[tuple2];

                    //if (dict1.Count != 50 || dict2.Count != 50)
                    //{
                    //    throw new ArgumentException();
                    //}

                    foreach (var focusSeed in focusSeedList)
                    {
                        double time1, time2;
                        //Only seeds present in both runs contribute; negative deltas clamp to zero
                        if (dict1.TryGetValue(focusSeed, out time1) && dict2.TryGetValue(focusSeed, out time2))
                        {
                            stat.AddNumber(Math.Max(time2 - time1, 0));
                        }
                    }
                    matrices[treeCountIndex, focusCountIndex] = stat.GetAverage();
                    focusCountIndex++;
                }
                treeCountIndex++;
            }

            Console.Write("[");
            for (int j = 0; j < treeCountList.Count - 1; j++)
            {
                for (int k = 0; k < focusCountList.Count; k++)
                {
                    Console.Write(matrices[j, k] + (k == focusCountList.Count - 1 ? ";" : ","));
                }
                if (j == treeCountList.Count - 1)
                {
                    Console.WriteLine("]/1000;");
                }
                else
                {
                    Console.WriteLine("...");
                }
            }
            Console.WriteLine();
        }
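The printed result is a MATLAB matrix literal: ',' separates columns, ';' ends a row, '...' continues the literal onto the next line, and the closing ']/1000;' rescales the averaged deltas (presumably from milliseconds to seconds), so the console output can be pasted directly into a MATLAB session.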
        List<int> GetLanguageErrorDocuments(IndexReader indexreader, string outputfile)
        {
            Console.WriteLine("==========Remove language error documents!==========");

            StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
            List<int>    removedDocuments = new List<int>();
            var          stopWords        = IsEnglish ?
                                            FileOperations.LoadKeyWordFile(StopWordsFile.EN, true) :
                                            FileOperations.LoadKeyWordFile(StopWordsFile.CH, false);
            var stopHash = Util.GetHashSet(stopWords);

            int    docNum        = indexreader.NumDocs();
            string titlefield    = this.TitleField;
            string bodyfield     = this.BodyField;
            int    removedDocNum = 0;

            Console.WriteLine("Total documents: {0}", docNum);

            var tokenConfig = new TokenizeConfig(IsEnglish ? TokenizerType.Standard : TokenizerType.ICTCLAS, StopWordsFile.NO);
            DoubleStatistics stat_percent  = new DoubleStatistics();
            DoubleStatistics stat_absolute = new DoubleStatistics();

            for (int idoc = 0; idoc < docNum; idoc++)
            {
                //Progress report every 10000 documents (the idoc != 0 guard avoids a divide by zero)
                if (idoc % 10000 == 0 && idoc != 0)
                {
                    Console.WriteLine("Processed " + idoc + " documents!");
                    Console.WriteLine("Removed {0} out of {1}: {2}%", removedDocNum, idoc, 100 * removedDocNum / idoc);
                    if (IsPrintTextFiles)
                    {
                        sw.Flush();
                    }
                }

                Document document = indexreader.Document(idoc);

                string content = document.Get(titlefield) + " " + document.Get(bodyfield);
                if (IsEnglish)
                {
                    content = content.ToLower();
                }
                var words    = NLPOperations.Tokenize(content, tokenConfig);
                var termCnt0 = words.Count;
                var termCnt1 = 0;
                foreach (var word in words)
                {
                    if (!stopHash.Contains(word))
                    {
                        termCnt1++;
                    }
                }

                //Stop-word fraction below the threshold => likely not the target language
                if (((double)termCnt0 - termCnt1) / termCnt0 < MinLanguageCorrectRatio)
                {
                    if (IsPrintTextFiles)
                    {
                        sw.WriteLine(DocumentToString(document));
                    }
                    removedDocuments.Add(idoc);
                    removedDocNum++;
                }
                else
                {
                    stat_absolute.AddNumber(termCnt0 - termCnt1);
                    stat_percent.AddNumber((100.0) * (termCnt0 - termCnt1) / termCnt0);
                }
            }

            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
            if (IsPrintTextFiles)
            {
                sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
                sw.Flush();
                sw.Close();
            }

            Console.WriteLine(stat_percent.ToString("stat_percent"));
            Console.WriteLine(stat_absolute.ToString("stat_absolute"));

            return removedDocuments;
        }
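The removal test is easy to misread: termCnt0 - termCnt1 is the number of stop words, so a document is removed when its stop-word fraction falls below MinLanguageCorrectRatio; text in the wrong language matches few stop words from the configured list. A hypothetical restatement of just that predicate, not part of the original class:

        using System.Collections.Generic;
        using System.Linq;

        //A document passes when at least minStopRatio of its tokens are
        //stop words for the target language
        static bool IsLikelyTargetLanguage(IList<string> words, HashSet<string> stopHash, double minStopRatio)
        {
            if (words.Count == 0)
            {
                return false; //no tokens, no evidence: treat as a language error
            }
            int stopCount = words.Count(stopHash.Contains);
            return (double)stopCount / words.Count >= minStopRatio;
        }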