Exemplo n.º 1
0
        /// <summary>
        /// Runs "Step 4": ranks the cleaned text files against the manual topic terms with
        /// <c>RankDocByClumpingLessClumps</c> and reports whether the result file was produced.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> and to the ranker for progress output.</param>
        /// <returns><c>true</c> when the related-files result exists after ranking; otherwise <c>false</c>.</returns>
        private static bool RankTopic(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Step 4", "Rank Topic Relative Files");
            OutputMg.OutputContent(backgroundWorker, "Start to rank topic related files.");

            // Input term list and output ranking both live in the manual topic-label directory.
            string termFilePath    = FileMg.ManualTopicLabelFileDir + Constants.TopicManualTermFileName;
            string relatedFilePath = FileMg.ManualTopicLabelFileDir + Constants.TopicManualRelatedFileName;

            RankDocByClumpingLessClumps ranker =
                new RankDocByClumpingLessClumps(termFilePath, FileMg.ManualCleanTextFileDir, relatedFilePath);
            ranker.DoClumpingRank(backgroundWorker);

            Console.WriteLine("output path:" + relatedFilePath);

            // The presence of the result file is the only success signal checked here.
            bool resultWritten = File.Exists(relatedFilePath);
            OutputMg.OutputContent(
                backgroundWorker,
                resultWritten ? "Finished ranking topic related files." : "Failed to rank topic related files.");

            return resultWritten;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Calculates the topic-term density of each document and writes the ranking result.
        /// Files are read from the clean-text directory itself and from its immediate
        /// sub-directories only (two levels); deeper nesting is not visited.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        public void DoClumpingRank(BackgroundWorker backgroundWorker)
        {
            // Drop the result of any earlier run before producing a new one.
            if (File.Exists(rankResult))
            {
                File.Delete(rankResult);
            }

            OutputMg.OutputContent(backgroundWorker, "Start parsing topic terms");
            // The term file is lower-cased as a whole before it is parsed.
            topicTerms = ParseTopicTerms.GetTopicTermValueList(FileOperators.ReadFileText(topicTermsFilePath).ToLower());
            OutputMg.OutputContent(backgroundWorker, "Finished parsing topic terms.");

            OutputMg.OutputContent(backgroundWorker, "Start ranking topic");
            docTopicDensityMap      = new Dictionary<string, Dictionary<string, float>>();
            fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();

            // Level 1: files directly under the clean-text directory.
            CalDocDensityMap(Directory.GetFiles(txtCleanFileDir));

            // Level 2: files inside each immediate sub-directory.
            foreach (string subDirectory in Directory.GetDirectories(txtCleanFileDir))
            {
                CalDocDensityMap(Directory.GetFiles(subDirectory));
            }

            OutputMg.OutputContent(backgroundWorker, "Finished ranking topic");

            OutputMg.OutputContent(backgroundWorker, "Start writing ranking topic");
            WriteRankingResult();
            OutputMg.OutputContent(backgroundWorker, "Finished writing ranking topic");
        }
Exemplo n.º 3
0
        private void TryToLoadCompSummary(BackgroundWorker backgroundWorker, string searchTermFilePath, string cleanComponentDir)
        {
            OutputMg.OutputContent(backgroundWorker, "Start to combine terms");
            CombineTerms termCombiner = new CombineTerms();
            Dictionary <string, List <string> > compTerms = termCombiner.GetCombinedTerms(searchTermFilePath, null);

            OutputMg.OutputContent(backgroundWorker, "Finished combining terms");
            OutputMg.OutputContent(backgroundWorker, "Start to generate summary");
            foreach (string comp in compTerms.Keys)
            {
                OutputMg.OutputContent(backgroundWorker, "-- Start to generate summary for " + comp);
                List <string> compTermList = compTerms[comp];

                //get the sentences of components
                string paraFile = cleanComponentDir + comp + ".txt";
                GenerateComponentSummary sentenceSplitter   = new GenerateComponentSummary();
                List <string>            candidateSentences = sentenceSplitter.SplitSingleFileSentence(paraFile);

                MMRSummary summaryGenerator = new MMRSummary();
                string     summary          = summaryGenerator.GenerateSummary(compTermList, candidateSentences);
                mCompSummDictionary.Add(comp, summary);
                OutputMg.OutputContent(backgroundWorker, "-- Finished generating summary for " + comp);
            }
            OutputMg.OutputContent(backgroundWorker, "Finished generating summary");
        }
Exemplo n.º 4
0
        /// <summary>
        /// Generates and caches the summary for a single component. The combined term lists are
        /// loaded on first use and kept in <c>mCompTerms</c>; the summary is stored in
        /// <c>mCompSummDictionary</c>.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        /// <param name="searchTermFilePath">Path passed to <c>GetCombinedTerms</c> when the terms are not loaded yet.</param>
        /// <param name="cleanComponentDir">
        /// Prefix for the component text file; the file read is
        /// <c>cleanComponentDir + compName + ".txt"</c> (plain concatenation).
        /// </param>
        /// <param name="compName">Name of the component to summarise.</param>
        /// <remarks>
        /// Returns silently, without the "Finished" messages, when the terms could not be loaded,
        /// the component is unknown, or a summary for it is already cached.
        /// </remarks>
        public void GenerateCompSummary(BackgroundWorker backgroundWorker, string searchTermFilePath, string cleanComponentDir, string compName)
        {
            OutputMg.OutputContent(backgroundWorker, "Start to generate summary");
            OutputMg.OutputContent(backgroundWorker, "-- Start to generate summary for " + compName);

            // Lazily load the combined terms once and reuse them for later calls.
            if (mCompTerms == null)
            {
                OutputMg.OutputContent(backgroundWorker, "Start to combine terms");
                CombineTerms termCombiner = new CombineTerms();
                mCompTerms = termCombiner.GetCombinedTerms(searchTermFilePath, null);
                OutputMg.OutputContent(backgroundWorker, "Finished combining terms");
            }

            // Single lookup for the term list; also bail out when the summary already exists.
            List<string> compTermList;
            if (mCompTerms == null ||
                !mCompTerms.TryGetValue(compName, out compTermList) ||
                mCompSummDictionary.ContainsKey(compName))
            {
                return;
            }

            // Candidate sentences come from the component's own text file.
            string paraFile = cleanComponentDir + compName + ".txt";
            GenerateComponentSummary sentenceSplitter = new GenerateComponentSummary();
            List<string> candidateSentences = sentenceSplitter.SplitSingleFileSentence(paraFile);

            // The guard above already established that no summary is cached for compName,
            // so the summary can be added without a second ContainsKey check.
            MMRSummary summaryGenerator = new MMRSummary();
            string summary = summaryGenerator.GenerateSummary(compTermList, candidateSentences);
            mCompSummDictionary.Add(compName, summary);

            OutputMg.OutputContent(backgroundWorker, "-- Finished generating summary for " + compName);
            OutputMg.OutputContent(backgroundWorker, "Finished generating summary");
        }
Exemplo n.º 5
0
        /// <summary>
        /// Removes the previous TMT output and cache (best effort) and, when configured via
        /// <c>Configures.GetAutoIsDeleteExistingFile()</c>, clears and re-initialises the
        /// automated data folders ("Step 0").
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        /// <returns>
        /// <c>false</c> only when clearing the automated folders was requested and failed;
        /// otherwise <c>true</c>.
        /// </returns>
        private static bool ClearFolder(BackgroundWorker backgroundWorker)
        {
            try
            {
                FileMg.DirectoryDelete(FileMg.AutoTmtOutputFileDir, true);
                FileMg.DeleteTmtCacheFile(FileMg.AutoTmtDataFileDir);
            }
            catch (System.Exception ex)
            {
                // Best effort: a failure here must not stop the pipeline, but leave a trace
                // of the cause instead of discarding it entirely.
                System.Diagnostics.Trace.TraceWarning("Could not remove previous TMT output/cache: " + ex.Message);
            }

            if (Configures.GetAutoIsDeleteExistingFile())
            {
                OutputMg.OutputHeader1(backgroundWorker, "Step 0", "Clear output folder");
                OutputMg.OutputContent(backgroundWorker, "Start to clear");
                try
                {
                    FileMg.ClearAutoFolder();
                    FileMg.InitDataFolder();
                }
                catch (System.Exception ex)
                {
                    // Report the actual cause; the Administrator hint alone can be misleading.
                    OutputMg.OutputContent(backgroundWorker, "Clear folder failed: " + ex.Message);
                    OutputMg.OutputHeader1(backgroundWorker, "Failed", "Clear folder failed. Please try to run this tool as Administrator.");
                    return false;
                }

                OutputMg.OutputContent(backgroundWorker, "Finished clearing");
            }

            return true;
        }
Exemplo n.º 6
0
 /// <summary>
 /// Runs the three component-paragraph stages in order: parse the component terms, parse the
 /// component-related files, then extract the component paragraphs. A progress message is
 /// written through <c>OutputMg</c> before and after each stage.
 /// </summary>
 /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
 /// <param name="searchTerms">Passed to <c>ParseCompTerms</c>.</param>
 /// <param name="compRelatedFile">Passed to <c>ParseCompFiles</c> and to <c>ExtractCompParagraphs</c>.</param>
 /// <param name="paras">Second argument of <c>ExtractCompParagraphs</c> (presumably the source text location; confirm against that method).</param>
 /// <param name="storeFile">Third argument of <c>ExtractCompParagraphs</c> (presumably the output location; confirm against that method).</param>
 // Example values seen during development:
 //   searchTerms:     C:\Users\xlian\MyPapers\Simmons\qualityRequirements\ProgramImpl\TopicManualTerms.txt
 //   compRelatedFile: C:\Users\xlian\MyPapers\Simmons\qualityRequirements\ProgramImpl\TOpicManualRelatedFiles.txt
 public void IdentifyComponentPara(BackgroundWorker backgroundWorker, string searchTerms, string compRelatedFile, string paras, string storeFile)
 {
     // Stage 1: component terms.
     OutputMg.OutputContent(backgroundWorker, "Start parsing component terms");
     ParseCompTerms(searchTerms);
     OutputMg.OutputContent(backgroundWorker, "Parsing component terms has been done");
     // Stage 2: files related to each component.
     OutputMg.OutputContent(backgroundWorker, "Start parsing component files");
     ParseCompFiles(compRelatedFile);
     OutputMg.OutputContent(backgroundWorker, "Parsing component file has been done");
     // Stage 3: paragraph extraction.
     OutputMg.OutputContent(backgroundWorker, "Start extracting component paragraphs");
     ExtractCompParagraphs(compRelatedFile, paras, storeFile);
     OutputMg.OutputContent(backgroundWorker, "Extracting component paragraphs has been done");
 }
Exemplo n.º 7
0
        /// <summary>
        /// Runs "Step 2": counts the files under the automated source directory and hands the
        /// directory to <c>ExtractMg.ExtractFile</c> for text extraction.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> and to the extractor for progress output.</param>
        /// <returns>Always <c>true</c>; failures only surface as exceptions thrown by the called helpers.</returns>
        private static bool ExtractSourceFiles(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Step 2", "Extract Source File");

            // Count first so the total can be reported before extraction starts.
            string sourceDir = FileMg.AutoSourceFileDir;
            OutputMg.OutputContent(backgroundWorker, "Start to count number under " + sourceDir);
            int totalFiles = FileMg.CountFileNumber(sourceDir);
            OutputMg.OutputContent(backgroundWorker, "Finished counting. Total file number is: " + totalFiles);

            OutputMg.OutputContent(backgroundWorker, "Start to extract files");
            string tmtInputPath = FileMg.AutoTmtDataFileDir + Constants.TmtInputFileName;
            ExtractMg.ExtractFile(
                sourceDir,
                FileMg.AutoExtractTextFileDir,
                FileMg.AutoCleanTextFileDir,
                FileMg.AutoSemiCleanTextFileDir,
                tmtInputPath,
                backgroundWorker);
            OutputMg.OutputContent(backgroundWorker, "Finished extracting files");

            return true;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Runs "Step 5": asks <c>IdentifyComParagraphs</c> to extract the component paragraphs
        /// using the manual term file, the manual related-files ranking and the manual clean-text
        /// and clean-component locations from <c>FileMg</c>.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> and to the extractor for progress output.</param>
        /// <returns>Always <c>true</c>; failures only surface as exceptions thrown by the extractor.</returns>
        private static bool ExtractComponent(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Step 5", "Extract Component Files");
            OutputMg.OutputContent(backgroundWorker, "Start to extract component files.");

            IdentifyComParagraphs paragraphIdentifier = new IdentifyComParagraphs();

            // Both inputs sit in the manual topic-label directory.
            string termFilePath    = FileMg.ManualTopicLabelFileDir + Constants.TopicManualTermFileName;
            string relatedFilePath = FileMg.ManualTopicLabelFileDir + Constants.TopicManualRelatedFileName;

            paragraphIdentifier.IdentifyComponentPara(
                backgroundWorker,
                termFilePath,
                relatedFilePath,
                FileMg.ManualCleanTextFileDir,
                FileMg.ManualCleanComponentFileDir);

            OutputMg.OutputContent(backgroundWorker, "Finished extracting component files.");
            return true;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Prepares the data needed for rendering: loads the source (model) file and the
        /// similarity file, writing start/finish headers through <c>OutputMg</c> for each step.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        /// <param name="modelFilePath">Path passed to <c>TryToLoadSourceFile</c>.</param>
        /// <param name="searchTermFilePath">Currently unused: only referenced by the disabled step 3 call.</param>
        /// <param name="cleanComponentDir">Currently unused: only referenced by the disabled step 3 call.</param>
        /// <returns>Always <c>true</c>.</returns>
        public bool PrepareRender(BackgroundWorker backgroundWorker, string modelFilePath, string searchTermFilePath, string cleanComponentDir)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Global", "Start preparing data...");
            OutputMg.OutputHeader1(backgroundWorker, "Step 1", "Try to load source files");
            TryToLoadSourceFile(modelFilePath);
            OutputMg.OutputHeader1(backgroundWorker, "Step 1", "Finished loading source files");
            OutputMg.OutputHeader1(backgroundWorker, "Step 2", "Try to load similarity files");
            TryToLoadSimilarityFile();
            OutputMg.OutputHeader1(backgroundWorker, "Step 2", "Finished loading similarity files");
            // NOTE(review): step 3 only writes its start/finish headers; the actual load below is
            // commented out, so no component summaries are loaded here.
            OutputMg.OutputHeader1(backgroundWorker, "Step 3", "Try to load component summary files");
            //TryToLoadCompSummary(backgroundWorker, searchTermFilePath, cleanComponentDir);
            OutputMg.OutputHeader1(backgroundWorker, "Step 3", "Finished loading component summary files");

            OutputMg.OutputHeader1(backgroundWorker, "Global", "Finished preparing data...");
            return(true);
        }
Exemplo n.º 10
0
 /// <summary>
 /// Runs "Step 1": counts the files under <paramref name="dirName"/> and copies the directory
 /// into <c>FileMg.AutoSourceFileDir</c> via <c>FileMg.DirectoryCopy</c>.
 /// </summary>
 /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> and to the copy helper for progress output.</param>
 /// <param name="dirName">Directory whose files are counted and copied.</param>
 /// <returns><c>true</c> when counting and copying completed; <c>false</c> when either threw.</returns>
 private static bool CopyRawFiles(BackgroundWorker backgroundWorker, string dirName)
 {
     OutputMg.OutputHeader1(backgroundWorker, "Step 1", "Copy Source File");
     try
     {
         OutputMg.OutputContent(backgroundWorker, "Start to count number under " + dirName);
         int sourceFileNumber = FileMg.CountFileNumber(dirName);
         OutputMg.OutputContent(backgroundWorker, "Finished counting. Total file number is: " + sourceFileNumber);

         OutputMg.OutputContent(backgroundWorker, "Starting copy files");
         int numCopy = FileMg.DirectoryCopy(dirName, FileMg.AutoSourceFileDir, true, false, backgroundWorker);
         OutputMg.OutputContent(backgroundWorker, "Finished copying files. Total file number is: " + numCopy);
         return true;
     }
     catch (System.Exception ex)
     {
         // Report the actual cause first; a missing directory or a locked file is not fixed by
         // running as Administrator, so the generic hint alone can mislead.
         OutputMg.OutputContent(backgroundWorker, "Copy files failed: " + ex.Message);
         OutputMg.OutputHeader1(backgroundWorker, "Failed", "Copy files failed. Please try to run this tool as Administrator.");
         return false;
     }
 }
Exemplo n.º 11
0
        /// <summary>
        /// Runs the automated generation pipeline: clear folders, copy raw files, extract source
        /// files, generate topics with TMT, label topics and rank topics. The steps run in that
        /// order and the chain stops at the first step that returns <c>false</c>.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to every step and to <c>OutputMg</c> for progress output.</param>
        /// <param name="rawFilePath">Directory passed to <c>CopyRawFiles</c> as the source of the raw files.</param>
        /// <returns><c>true</c> when every step succeeded; otherwise <c>false</c>.</returns>
        public static bool StartGenerate(BackgroundWorker backgroundWorker, string rawFilePath)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Global", "Start to generate automated data!");

            // Short-circuit evaluation: a failing step prevents all later steps from running.
            bool allStepsSucceeded =
                ClearFolder(backgroundWorker)
                && CopyRawFiles(backgroundWorker, rawFilePath)
                && ExtractSourceFiles(backgroundWorker)
                && GenerateTopicWithTmt(backgroundWorker)
                && LabelTopic(backgroundWorker)
                && RankTopic(backgroundWorker);

            string closingMessage = allStepsSucceeded
                ? "Finished generating automated model!"
                : "Failed to generate automated model!";
            OutputMg.OutputHeader1(backgroundWorker, "Global", closingMessage);

            return allStepsSucceeded;
        }
Exemplo n.º 12
0
        /// <summary>
        /// Runs "Step 5": ranks the automated clean-text files against the generated topic terms
        /// with <c>RankDocByClumpingLessClumps</c> and checks that the result file was written.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> and to the ranker for progress output.</param>
        /// <returns><c>true</c> when the related-files result exists after ranking; otherwise <c>false</c>.</returns>
        private static bool RankTopic(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Step 5", "Rank Topic Relative Files");
            OutputMg.OutputContent(backgroundWorker, "Start to rank topic related files.");

            // Term list (input) and ranking (output) share the automated topic-label directory.
            string topicTermPath = FileMg.AutoTopicLabelFileDir + Constants.TopicTermFileName;
            string rankingPath   = FileMg.AutoTopicLabelFileDir + Constants.TopicRelatedFileName;

            new RankDocByClumpingLessClumps(topicTermPath, FileMg.AutoCleanTextFileDir, rankingPath)
                .DoClumpingRank(backgroundWorker);

            // Success is judged solely by the presence of the ranking file.
            if (File.Exists(rankingPath))
            {
                OutputMg.OutputContent(backgroundWorker, "Finished ranking topic related files.");
                return true;
            }

            OutputMg.OutputContent(backgroundWorker, "Failed to rank topic related files.");
            return false;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Runs "Step 3": parses the manual model file with <c>PrepareManualModel</c> and checks
        /// that the manual topic-term file was written to the manual topic-label directory.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        /// <param name="modelFilePath">Path of the manual model file passed to <c>PrepareManualModel</c>.</param>
        /// <returns><c>true</c> when the topic-term file exists after parsing; otherwise <c>false</c>.</returns>
        private static bool LabelTopic(BackgroundWorker backgroundWorker, string modelFilePath)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Step 3", "Label the Generated Topics");

            // Make sure the output directory is there before the parser writes into it.
            if (!Directory.Exists(FileMg.ManualTopicLabelFileDir))
            {
                Directory.CreateDirectory(FileMg.ManualTopicLabelFileDir);
            }

            OutputMg.OutputContent(backgroundWorker, "Start to parse manual model file.");

            string termFilePath = FileMg.ManualTopicLabelFileDir + Constants.TopicManualTermFileName;
            PrepareManualModel modelParser = new PrepareManualModel(modelFilePath, termFilePath);
            modelParser.ParseManualModel();

            // The presence of the term file is the only success signal checked here.
            if (File.Exists(termFilePath))
            {
                OutputMg.OutputContent(backgroundWorker, "Finished parsing manual model file.");
                return true;
            }

            OutputMg.OutputContent(backgroundWorker, "Failed to parse manual model file, please check the file format.");
            return false;
        }
Exemplo n.º 14
0
        /// <summary>
        /// Runs "Step 3": starts the TMT tool, checks that its term-distribution archive for the
        /// configured iteration exists, and unzips it into the R data directory.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        /// <returns>
        /// <c>true</c> when TMT ran, the archive was found and the unzipped R input file exists;
        /// otherwise <c>false</c>.
        /// </returns>
        private static bool GenerateTopicWithTmt(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Step 3", "Generate Topic with TMT");

            // 1. Run the TMT tool.
            OutputMg.OutputContent(backgroundWorker, "Start to run TMT");
            bool tmtRan = TmtToolMg.RunTmtTool(Configures.GetAutoWizardTopicNumberArray(), Configures.GetAutoWizardMaxIteration());
            if (!tmtRan)
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to startup TMT. Make sure you have authority to run command.");
                return false;
            }

            // 2. Check the TMT output: the archive name embeds the iteration count as five digits.
            int iterationCount = Int32.Parse(Configures.GetAutoWizardMaxIteration());
            string archiveRelativePath = string.Format(Constants.TmtOutputTopicTermDistZipFilePathTemp, iterationCount.ToString("D5"));
            string archivePath = FileMg.AutoTmtOutputFileDir + archiveRelativePath;
            if (!File.Exists(archivePath))
            {
                OutputMg.OutputContent(backgroundWorker, "Cannot find the result file of topic modeling.");
                return false;
            }

            OutputMg.OutputContent(backgroundWorker, "Finished generating topic and term distribution.");

            // 3. Unzip the term distribution; the R input file is only looked for when the unzip
            //    tool itself reported success.
            OutputMg.OutputContent(backgroundWorker, "Start to unzip term distribution file");
            bool unzipped = UnzipToolMg.RunUnzipTool(archivePath, FileMg.AutoRDataFileDir)
                            && File.Exists(FileMg.AutoRDataFileDir + Constants.RInputFileName);
            if (!unzipped)
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to unzip term distribution file. Make sure you have setup 7-zip.");
                return false;
            }

            OutputMg.OutputContent(backgroundWorker, "Finished unzipping term distribution file.");
            return true;
        }
Exemplo n.º 15
0
        /// <summary>
        /// Opens the original document in Word, highlights in yellow every paragraph that contains
        /// one of the target topic terms, and saves the result to <c>highlightedFilePath</c>.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        /// <returns>
        /// A map from 1-based page number to the lower-cased text of the highlighted paragraphs on
        /// that page, each followed by a tab. When Word refuses to navigate the document, a warning
        /// box is shown, nothing is saved, and only the pages processed so far are returned.
        /// </returns>
        public Dictionary<int, string> ExecuteHighlight(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputContent(backgroundWorker, "Starting highlight file " + originalFilePath);

            // If the highlighted document is still open in Word, close it first.
            CloseWordInstancesBlockingHighlight();

            Dictionary<int, string> pageContents = new Dictionary<int, string>();
            object missing = System.Reflection.Missing.Value;
            bool navigationFailed = false;

            Microsoft.Office.Interop.Word.Application app = null;
            Microsoft.Office.Interop.Word.Document doc = null;
            try
            {
                app = new Microsoft.Office.Interop.Word.Application();
                app.Visible = false;
                object readOnly = false;
                doc = app.Documents.Open(this.originalFilePath, missing, readOnly);

                int pageNum = doc.Content.ComputeStatistics(Microsoft.Office.Interop.Word.WdStatistic.wdStatisticPages);

                List<string> topicTerms = ReadTargetTopicTerms.ParseTopicTerms(this.topicTermPath, this.targetTopicName);
                AddUserSearchTerms(topicTerms);

                for (int p = 1; p <= pageNum; p++)
                {
                    OutputMg.OutputContent(backgroundWorker, "Parsing page: " + p);

                    object what = WdGoToItem.wdGoToPage;
                    object which = WdGoToDirection.wdGoToAbsolute;
                    object nextPage = p + 1;
                    Range startRange;
                    Range endRange;

                    try
                    {
                        startRange = app.Selection.GoTo(ref what, ref which, p, ref missing);
                        endRange = app.Selection.GoTo(what, which, nextPage, missing);
                    }
                    catch (Exception)
                    {
                        // Stop processing; the document is closed in the finally block and the
                        // user is told afterwards. Nothing is saved on this path.
                        navigationFailed = true;
                        break;
                    }

                    // On the last page "go to next page" lands on the same position, so extend
                    // the range to the last line instead.
                    if (startRange.Start == endRange.Start)
                    {
                        which = WdGoToDirection.wdGoToLast;
                        what = WdGoToItem.wdGoToLine;
                        endRange = app.Selection.GoTo(what, which, nextPage, missing);
                    }

                    endRange.SetRange(startRange.Start, endRange.End);

                    System.Text.StringBuilder pageHighlight = new System.Text.StringBuilder();
                    foreach (Paragraph paragraph in endRange.Paragraphs)
                    {
                        Range paragraphRange = paragraph.Range;
                        string paraText = paragraphRange.Text.ToLower();
                        if (paraText.Length == 0)
                        {
                            continue;
                        }

                        foreach (string topicTerm in topicTerms)
                        {
                            // A plural match (term + "s") necessarily contains the term itself,
                            // so one Contains check covers both cases.
                            if (paraText.Contains(topicTerm))
                            {
                                paragraphRange.HighlightColorIndex = WdColorIndex.wdYellow;
                                pageHighlight.Append(paraText).Append("\t");
                                break;
                            }
                        }
                    }

                    pageContents.Add(p, pageHighlight.ToString());
                }

                if (!navigationFailed)
                {
                    doc.SaveAs2(this.highlightedFilePath);
                }
            }
            finally
            {
                // Always release the document and the Word instance, even when an exception
                // escapes, so no hidden WINWORD process is left behind.
                try
                {
                    if (doc != null)
                    {
                        // Highlights are either already saved to highlightedFilePath or must be
                        // discarded; never write them back or prompt in the invisible instance.
                        object doNotSave = WdSaveOptions.wdDoNotSaveChanges;
                        doc.Close(ref doNotSave, ref missing, ref missing);
                    }
                }
                finally
                {
                    if (app != null)
                    {
                        app.Quit();
                    }
                }
            }

            if (navigationFailed)
            {
                MessageBox.Show("This document is locked by author. We cannot execute highlight", "Failed", MessageBoxButtons.OK, MessageBoxIcon.Warning);
            }

            return pageContents;
        }

        // Kills WINWORD processes that have no window title, and any whose window title contains
        // the file name of highlightedFilePath; in the latter case the stale highlighted file is
        // deleted once the process has exited.
        private void CloseWordInstancesBlockingHighlight()
        {
            const int killWaitMilliseconds = 5000;

            // Path.GetFileName yields the name after the last directory separator; the previous
            // manual Substring skipped one character too many and could yield an empty string,
            // which matched every window title.
            string targetFileName = System.IO.Path.GetFileName(highlightedFilePath);

            foreach (System.Diagnostics.Process process in System.Diagnostics.Process.GetProcessesByName("WINWORD"))
            {
                string windowTitle = process.MainWindowTitle;
                if (windowTitle.Length == 0)
                {
                    process.Kill();
                }
                else if (!string.IsNullOrEmpty(targetFileName) && windowTitle.Contains(targetFileName))
                {
                    process.Kill();
                    // Kill is asynchronous; wait so Word has released the file before deleting it.
                    process.WaitForExit(killWaitMilliseconds);
                    System.IO.File.Delete(highlightedFilePath);
                }
            }
        }
Exemplo n.º 16
0
        /// <summary>
        /// Recursively extracts text from the PDF and Word files under <paramref name="sourceDirName"/>,
        /// cleans each extracted text file and appends it to the TMT input file.
        /// </summary>
        /// <param name="sourceDirName">Directory to scan; must exist.</param>
        /// <param name="destDirName">Directory receiving the raw extracted <c>.txt</c> files; created when missing.</param>
        /// <param name="cleanDirName">Directory receiving the cleaned <c>.txt</c> files; created when missing.</param>
        /// <param name="semiCleanDirName">Only forwarded to the recursive calls (extended by the sub-directory name); not otherwise used here.</param>
        /// <param name="tmtInputFilePath">Path passed to <c>GenerateTmtInputFile</c> for every cleaned file.</param>
        /// <param name="backgroundWorker">Optional worker for progress output; no messages are written when it is <c>null</c>.</param>
        /// <returns>The value of <c>curDealFileIndex</c> after this directory and its sub-directories were processed.</returns>
        /// <exception cref="DirectoryNotFoundException">Thrown when <paramref name="sourceDirName"/> does not exist.</exception>
        private static int ExecuteExtract(string sourceDirName, string destDirName, string cleanDirName, string semiCleanDirName,
                                          string tmtInputFilePath, BackgroundWorker backgroundWorker = null)
        {
            if (backgroundWorker != null)
            {
                OutputMg.OutputContent(backgroundWorker, "Extrating file from " + sourceDirName + " to " + destDirName);
            }

            DirectoryInfo dir = new DirectoryInfo(sourceDirName);
            if (!dir.Exists)
            {
                throw new DirectoryNotFoundException(
                          "Source directory does not exist or could not be found: "
                          + sourceDirName);
            }

            // Create the output directories on demand.
            if (!Directory.Exists(destDirName))
            {
                Directory.CreateDirectory(destDirName);
            }

            if (!Directory.Exists(cleanDirName))
            {
                Directory.CreateDirectory(cleanDirName);
            }

            foreach (FileInfo file in dir.GetFiles())
            {
                curDealFileIndex++;
                if (backgroundWorker != null)
                {
                    // Report the same counter as the success/failure messages below.
                    OutputMg.OutputContent(backgroundWorker, "Extrating file " + file.Name, curDealFileIndex);
                }

                try
                {
                    // Match extensions regardless of case so "REPORT.PDF" is not silently skipped.
                    bool isPdf = file.Name.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase);
                    bool isWord = file.Name.EndsWith(".doc", System.StringComparison.OrdinalIgnoreCase)
                                  || file.Name.EndsWith(".docx", System.StringComparison.OrdinalIgnoreCase);

                    // NOTE(review): only PDF and Word files are processed. The earlier PowerPoint
                    // and Excel branches were unreachable behind this same filter and have been
                    // dropped; enabling those formats needs a deliberate decision.
                    if (!isPdf && !isWord)
                    {
                        continue;
                    }

                    string extractedFileName = file.Name + ".txt";
                    string extractedPath     = System.IO.Path.Combine(destDirName, extractedFileName);
                    string cleanPath         = System.IO.Path.Combine(cleanDirName, extractedFileName);

                    // Reuse an existing extraction result instead of extracting again.
                    if (!File.Exists(extractedPath))
                    {
                        if (isPdf)
                        {
                            ExtractPDF.ExecuteExtraction(file.FullName, extractedPath);
                        }
                        else
                        {
                            ExtractWord.ExecuteWordExtraction(file.FullName, extractedPath);
                        }
                    }

                    // Clean the extracted text.
                    ExtractContent.Preprocess.SemiClean cleaner = new Preprocess.SemiClean();
                    cleaner.CleanDir(extractedPath, cleanPath);

                    // Add the cleaned text to the TMT input file.
                    GenerateTmtInputFile(cleanPath, tmtInputFilePath);

                    if (backgroundWorker != null)
                    {
                        OutputMg.OutputContent(backgroundWorker, "Successfuly Extrated file " + file.Name, curDealFileIndex);
                    }
                }
                catch (Exception ex)
                {
                    // One bad file must not abort the run. Guard the worker like every other
                    // progress call in this method, since it is optional.
                    if (backgroundWorker != null)
                    {
                        OutputMg.OutputContent(backgroundWorker, "Failed to extract file " + file.Name + " with Exception: " + ex.Message, curDealFileIndex);
                    }
                }
            }

            // Recurse into sub-directories, mirroring the directory structure in every output root.
            foreach (DirectoryInfo subdir in dir.GetDirectories())
            {
                string subDestPath      = System.IO.Path.Combine(destDirName, subdir.Name);
                string subCleanPath     = System.IO.Path.Combine(cleanDirName, subdir.Name);
                string subSemiCleanPath = System.IO.Path.Combine(semiCleanDirName, subdir.Name);
                ExecuteExtract(subdir.FullName, subDestPath, subCleanPath, subSemiCleanPath, tmtInputFilePath, backgroundWorker);
            }

            return curDealFileIndex;
        }
Exemplo n.º 17
0
        /// <summary>
        /// Runs "Step 4": prepares the topic-term file from the TMT summary, runs the JNSP tool,
        /// labels the topics, computes topic similarity and finally runs the R tool. Each stage is
        /// verified by checking that its output file exists.
        /// </summary>
        /// <param name="backgroundWorker">Worker handed to <c>OutputMg</c> for progress output.</param>
        /// <returns><c>true</c> when every stage produced its output file; otherwise <c>false</c>.</returns>
        private static bool LabelTopic(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputHeader1(backgroundWorker, "Step 4", "Label Topic");

            /* 0. Create file directory */
            if (!Directory.Exists(FileMg.AutoTopicLabelFileDir))
            {
                Directory.CreateDirectory(FileMg.AutoTopicLabelFileDir);
            }

            /* 1. Get minimum topic number */
            OutputMg.OutputContent(backgroundWorker, "Getting minimum topic number.");
            int minimumTopicNumber = PrepareTopicFile.GetMinimumTopicNumber();
            int maxIteration       = Int32.Parse(Configures.GetAutoWizardMaxIteration());

            // -1 is the value this method treats as "no topic number available".
            if (minimumTopicNumber == -1)
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to get minimum topic number. Please check whether you run TMT successfully.");
                return false;
            }

            /* 2. Generate the topic terms file from the TMT summary */
            OutputMg.OutputContent(backgroundWorker, "Minimum topic number is " + minimumTopicNumber);
            string summaryFilePath   = FileMg.AutoTmtOutputFileDir + string.Format(Constants.TmtOutputSummaryFilePathTemp, maxIteration.ToString("D5"));
            string topicTermFilePath = FileMg.AutoTopicLabelFileDir + Constants.TopicTermFileName;

            OutputMg.OutputContent(backgroundWorker, "Start to generate topic terms file.");
            PrepareTopicFile.Execute(summaryFilePath, topicTermFilePath);
            if (!File.Exists(topicTermFilePath))
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to generate topic terms file.");
                return false;
            }

            OutputMg.OutputContent(backgroundWorker, "Finished generating topic terms file.");

            /* 3. Run JNSP */
            OutputMg.OutputContent(backgroundWorker, "Start to run JNSP tool.");
            if (!Directory.Exists(FileMg.AutoJNSPDataFileDir))
            {
                Directory.CreateDirectory(FileMg.AutoJNSPDataFileDir);
            }

            JNSPToolMg.RunJNSPTool();
            string jnspOutputFileName = FileMg.AutoJNSPDataFileDir + Constants.JnspOptionCNTFileName + Constants.JnspOptionWindowNumber + ".cnt";
            if (!File.Exists(jnspOutputFileName))
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to run JNSP tool.");
                return false;
            }

            OutputMg.OutputContent(backgroundWorker, "Finished running JNSP tool.");

            /* 4. Label topic */
            OutputMg.OutputContent(backgroundWorker, "Start to label topic.");
            string topicLabelFilePath = FileMg.AutoTopicLabelFileDir + Constants.TopicLabelFileName;
            KDDLabel kddLabel = new KDDLabel(jnspOutputFileName, topicTermFilePath, summaryFilePath, topicLabelFilePath);
            kddLabel.GenerateTopicLabel();
            if (!File.Exists(topicLabelFilePath))
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to label topic.");
                return false;
            }

            OutputMg.OutputContent(backgroundWorker, "Finished labeling topic");

            /* 5. Generate Similarity */
            OutputMg.OutputContent(backgroundWorker, "Start to generate topic similarity.");
            string topicSimilarityFilePath = FileMg.AutoTopicLabelFileDir + Constants.TopicSimilarityFileName;
            TopicSim.CalTopicSimilarity(topicTermFilePath, topicSimilarityFilePath);

            // Verify the similarity file itself. Checking the label file here (already verified in
            // stage 4) would let a failed similarity calculation pass unnoticed.
            if (!File.Exists(topicSimilarityFilePath))
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to generate topic similarity.");
                return false;
            }

            OutputMg.OutputContent(backgroundWorker, "Finished generating topic similarity");

            /* 6. Running R Tool */
            OutputMg.OutputContent(backgroundWorker, "Start to generate BiTree with R tool.");
            RToolMg.RunRTool();
            if (!File.Exists(FileMg.AutoRDataFileDir + Constants.ROutputFileName))
            {
                OutputMg.OutputContent(backgroundWorker, "Failed to generate BiTree with R tool. Please make sure you have setup R tool.");
                return false;
            }

            OutputMg.OutputContent(backgroundWorker, "Finished generating BiTree with R tool");

            return true;
        }
Exemplo n.º 18
0
        /// <summary>
        /// Copies the files of <paramref name="sourceDirName"/> into <paramref name="destDirName"/> and,
        /// optionally, hands each subdirectory on for copying. Files that already exist at the
        /// destination are left untouched.
        /// </summary>
        /// <param name="sourceDirName">Directory to copy from; must exist.</param>
        /// <param name="destDirName">Directory to copy into; created when it does not exist.</param>
        /// <param name="copySubDirs">When true, every subdirectory is passed to <c>DirectoryCopy</c>.</param>
        /// <param name="createDirs">Not read by this method; forwarded unchanged to <c>DirectoryCopy</c>.</param>
        /// <param name="backgroundWorker">Optional progress sink; when null no progress is reported.</param>
        /// <param name="currentFileIndex">Running count of files visited before this call.</param>
        /// <returns>The running file count after this directory (and any subdirectories) was processed.</returns>
        /// <exception cref="DirectoryNotFoundException">The source directory does not exist.</exception>
        public static int ToolDirectoryCopy(string sourceDirName, string destDirName, bool copySubDirs, bool createDirs,
                                            BackgroundWorker backgroundWorker = null, int currentFileIndex = 0)
        {
            bool reportProgress = backgroundWorker != null;

            if (reportProgress)
            {
                OutputMg.OutputContent(backgroundWorker, "Copying file from " + sourceDirName + " to " + destDirName);
            }

            DirectoryInfo dir = new DirectoryInfo(sourceDirName);

            if (!dir.Exists)
            {
                throw new DirectoryNotFoundException(
                          "Source directory does not exist or could not be found: "
                          + sourceDirName);
            }

            // Snapshot the subdirectories BEFORE the destination is created, so a destination
            // that lives inside the source is not picked up as one of its subdirectories.
            DirectoryInfo[] dirs = dir.GetDirectories();

            if (!Directory.Exists(destDirName))
            {
                Directory.CreateDirectory(destDirName);
            }

            // Copy the files of this directory. A failure on one file is reported and the
            // loop carries on with the next one (best-effort copy).
            foreach (FileInfo file in dir.GetFiles())
            {
                try
                {
                    // The index counts every file visited, including ones skipped or failed.
                    currentFileIndex++;

                    // Progress is only reported when a worker was supplied, like every other
                    // progress call in this method. The call sits inside the try, so an
                    // exception raised while reporting would land in the catch below and
                    // the file would silently not be copied.
                    if (reportProgress)
                    {
                        OutputMg.OutputContent(backgroundWorker, "Copying file " + file.Name, currentFileIndex);
                    }

                    string temppath = Path.Combine(destDirName, file.Name);
                    if (!File.Exists(temppath))
                    {
                        file.CopyTo(temppath, false);
                    }
                }
                catch (Exception ex)
                {
                    if (reportProgress)
                    {
                        OutputMg.OutputContent(backgroundWorker, "Copying file failed with exception: " + ex.Message);
                    }
                }
            }

            if (copySubDirs)
            {
                foreach (DirectoryInfo subdir in dirs)
                {
                    string temppath = Path.Combine(destDirName, subdir.Name);

                    // NOTE(review): subdirectories go through DirectoryCopy (defined elsewhere),
                    // not ToolDirectoryCopy - confirm that this is intended.
                    currentFileIndex = DirectoryCopy(subdir.FullName, temppath, copySubDirs, createDirs, backgroundWorker, currentFileIndex);
                }
            }

            return(currentFileIndex);
        }
Exemplo n.º 19
0
        /// <summary>
        /// Highlights the topic terms found in the PDF at <c>pdfFilePath</c>, writes the annotated
        /// copy to <c>highlightedPDFPath</c>, and records the highlighted text of each page.
        /// </summary>
        /// <param name="backgroundWorker">Worker passed to <c>OutputMg.OutputContent</c> for progress messages.</param>
        /// <returns>
        /// <c>pageContents</c>, which receives one entry per page that had at least one match:
        /// the page number mapped to the distinct highlighted text snippets, one per line.
        /// </returns>
        public Dictionary <int, string> ExecuteHighlight(BackgroundWorker backgroundWorker)
        {
            OutputMg.OutputContent(backgroundWorker, "Starting highlight file " + pdfFilePath);

            List <string> topicTerms = ReadTargetTopicTerms.ParseTopicTerms(this.topicTermPath, this.targetTopicName);

            // presumably appends the user's own search terms to the list - confirm in AddUserSearchTerms
            AddUserSearchTerms(topicTerms);

            string sourcePath = pdfFilePath;
            string outputPath = highlightedPDFPath;

            // Two readers are opened on the same file: 'stampReader' backs the stamper (and the
            // per-rectangle text extraction below), 'scanReader' is used to locate the terms.
            // Both are disposed here; the stamper's reader used to be left open.
            using (PdfReader stampReader = new PdfReader(sourcePath))
            using (FileStream fs = new FileStream(outputPath, FileMode.Create, FileAccess.Write, FileShare.None))
            using (PdfStamper stamper = new PdfStamper(stampReader, fs))
            using (PdfReader scanReader = new PdfReader(sourcePath))
            {
                int pageCount = scanReader.NumberOfPages;

                // PDF page numbers are 1-based.
                for (int i = 1; i <= pageCount; i++)
                {
                    OutputMg.OutputContent(backgroundWorker, "Parsing page: " + i);

                    Rectangle pageRect = scanReader.GetPageSize(i);

                    // The Document is only created to read its left margin for this page size.
                    Document doc = new Document(pageRect);

                    float leftMargin = doc.LeftMargin;
                    float lineWidth  = pageRect.Width;

                    var textPos = new FutherLocationTextExtractionStrategy(topicTerms);

                    // The returned text is not needed; the call is made so that the strategy
                    // collects the match positions exposed through textPos.myPoints.
                    PdfTextExtractor.GetTextFromPage(scanReader, i, textPos);

                    // Collect the rectangle of every match on this page.
                    List <iTextSharp.text.Rectangle> quadList = new List <iTextSharp.text.Rectangle>();

                    foreach (var p in textPos.myPoints)
                    {
                        quadList.Add(p.Rect);
                    }

                    if (quadList.Count == 0)
                    {
                        // No match on this page: no annotation and no pageContents entry.
                        continue;
                    }

                    List <string> pageContent = new List <string>();

                    // Order, then merge/adjust the rectangles; the adjusted ones get highlighted.
                    List <iTextSharp.text.Rectangle> orderedRect  = orderRectByBottom(quadList);
                    List <iTextSharp.text.Rectangle> adjustedRect = adjustRect(orderedRect, lineWidth, leftMargin);

                    foreach (Rectangle rect in adjustedRect)
                    {
                        // Quad points of the rectangle. NOTE: the order below doesn't appear to
                        // match the actual spec but is what Acrobat produces.
                        float[] quad = { rect.Left, rect.Bottom, rect.Right, rect.Bottom, rect.Left, rect.Top, rect.Right, rect.Top };

                        PdfAnnotation highlight = PdfAnnotation.CreateMarkup(stamper.Writer, rect, null, PdfAnnotation.MARKUP_HIGHLIGHT, quad);

                        highlight.Color = BaseColor.YELLOW;

                        stamper.AddAnnotation(highlight, i); // i is the page

                        // Extract the text that lies inside the highlighted rectangle.
                        RenderFilter[] filter = { new RegionTextRenderFilter(rect) };
                        ITextExtractionStrategy strategy = new MyFilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                        string text = PdfTextExtractor.GetTextFromPage(stampReader, i, strategy).Trim();

                        // Keep each snippet once per page, in first-seen order.
                        if (!pageContent.Contains(text))
                        {
                            pageContent.Add(text);
                        }
                    }

                    StringBuilder sb = new StringBuilder();

                    foreach (string tmp in pageContent)
                    {
                        sb.AppendLine(tmp);
                    }

                    pageContents.Add(i, sb.ToString());
                }
            }

            return(pageContents);
        }