Пример #1
0
        /*
         * Cleans one text file, or every text file of a two-level directory tree.
         *
         * originalText: path of the source file or of the source directory.
         * cleanedText:  output file path when originalText is a file; output directory
         *               when originalText is a directory.
         *
         * Directory mode handles the files directly inside originalText and the files
         * directly inside each of its immediate subdirectories (deeper levels are not
         * visited). Each output is named <FileNameParse.GetFileName(source)>.txt;
         * outputs of a subdirectory go into a folder of the same name below cleanedText.
         *
         * The actual cleaning is done by CleanSingleFile. Author's notes on what it covers:
         *  - delete: contents, list of tables, list of figures, appendix, revision history
         *  - split as paragraphs
         *  - filter: numbers -> "", camel-case split, lowercase, more than two empty lines
         */
        public void CleanDir(string originalText, string cleanedText)
        {
            bool isDirectory = (File.GetAttributes(originalText) & FileAttributes.Directory) != 0;
            if (!isDirectory)
            {
                // Single file: cleanedText is used verbatim as the output path.
                CleanSingleFile(originalText, cleanedText);
                return;
            }

            // Files located directly in the source directory.
            string topLevelPrefix = cleanedText + "\\";
            foreach (string topLevelFile in Directory.GetFiles(originalText))
            {
                CleanSingleFile(topLevelFile, topLevelPrefix + FileNameParse.GetFileName(topLevelFile) + ".txt");
            }

            // Files located one level down, mirrored into a same-named output folder.
            foreach (string childDir in Directory.GetDirectories(originalText))
            {
                string[] childFiles  = Directory.GetFiles(childDir);
                string   childPrefix = cleanedText + "\\" + FileNameParse.GetFileName(childDir) + "\\";

                for (int i = 0; i < childFiles.Length; i++)
                {
                    string outputPath = childPrefix + FileNameParse.GetFileName(childFiles[i]) + ".txt";
                    CleanSingleFile(childFiles[i], outputPath);
                }
            }
        }
Пример #2
0
        /*
         * Fills docTopicDensityMap with the topic/density table of every entry in fileEntities.
         *
         * fileEntities: paths of the files to analyse.
         *
         * Densities are cached per file name in fileNameTopicDensityMap, so two entries that
         * share a file name (as reported by FileNameParse.GetFileName) share one table and the
         * file content is read only for the first of them.
         *
         * A file that yields no term longer than one character is skipped. The previous code
         * used `break` at that point, which silently dropped every file after the first
         * empty one.
         */
        private void CalDocDensityMap(string[] fileEntities)
        {
            char[] delimiters = new char[] { ' ' };

            foreach (string fileEntity in fileEntities)
            {
                string fileName = FileNameParse.GetFileName(fileEntity);

                Dictionary<string, float> topicDensity;
                if (!fileNameTopicDensityMap.TryGetValue(fileName, out topicDensity))
                {
                    // Flatten the file to one lower-cased line and split it on spaces.
                    // NOTE(review): only "\n" is replaced, so a "\r" from Windows line endings
                    // stays attached to the preceding term - confirm whether that is intended.
                    string   fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
                    string[] rawTerms    = fileContent.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

                    // Keep only the terms that are longer than a single character.
                    List<string> fileTermList = new List<string>();
                    foreach (string rawTerm in rawTerms)
                    {
                        if (rawTerm.Length > 1)
                        {
                            fileTermList.Add(rawTerm);
                        }
                    }

                    if (fileTermList.Count == 0)
                    {
                        // Nothing to measure in this file; carry on with the next one.
                        continue;
                    }

                    // Topic -> density table for this single file.
                    topicDensity = CalWindowDensityInOneFile(fileTermList);
                    fileNameTopicDensityMap.Add(fileName, topicDensity);
                }

                docTopicDensityMap.Add(fileEntity, topicDensity);
            }
        }
Пример #3
0
 /*
  * Stems every file below sourceFile and stores the results below stemmedPath,
  * keeping the original directory hierarchy.
  *
  * sourceFile:  directory to process; when IsDirectory reports false nothing is done.
  * stemmedPath: directory that receives the stemmed copies.
  *
  * Each line is split on spaces and commas, every term is passed through Porter2.stem,
  * and the stemmed terms are written back separated (and followed) by a single space,
  * one "\r\n"-terminated output line per input line.
  */
 public void FileDirStemming(string sourceFile, string stemmedPath)
 {
     if (!IsDirectory(sourceFile))
     {
         // Only directories are processed; any other path is ignored.
         return;
     }

     // Recurse into the subdirectories first so each one is mirrored below stemmedPath.
     foreach (string subDir in Directory.GetDirectories(sourceFile))
     {
         string subDirName = FileNameParse.GetFileName(subDir);
         FileDirStemming(subDir, stemmedPath + "\\" + subDirName);
     }

     string[] separators = { " ", "," };

     // One stemmer is enough for the whole directory: the original already reused a
     // single instance for every term of a line.
     Porter2 porter = new Porter2();

     foreach (string subFile in Directory.GetFiles(sourceFile))
     {
         string[] fileLines = FileOperators.ReadFileLines(subFile);
         string   fileName  = FileNameParse.GetFileName(subFile);

         // StringBuilder avoids the quadratic cost of repeated string concatenation.
         System.Text.StringBuilder stemmedContent = new System.Text.StringBuilder();
         foreach (string fileLine in fileLines)
         {
             foreach (string term in fileLine.Split(separators, StringSplitOptions.RemoveEmptyEntries))
             {
                 stemmedContent.Append(porter.stem(term)).Append(' ');
             }
             stemmedContent.Append("\r\n");
         }

         FileOperators.FileWrite(stemmedPath + "\\" + fileName, stemmedContent.ToString());
     }
 }
Пример #4
0
        /*
         * Cleans one text file and writes the result to destFileName.
         *
         * sourceFilePath: file whose lines are read and cleaned.
         * destFileName:   path the cleaned text is written to.
         *
         * Pipeline: DeleteDecorate -> DetailClean -> CleanMoreSpace -> MergeContent; the
         * resulting lines are joined with "\r\n" before being written.
         */
        private void CleanSingleFile(string sourceFilePath, string destFileName)
        {
            string[] oriLines = FileOperators.ReadFileLines(sourceFilePath);

            // Strip the decorative sections (contents, list of figures, list of tables, appendix).
            List<string> filterDecorate = DeleteDecorate(oriLines);

            // Paragraph splitting is currently disabled:
            //SplitParagraph(filterDecorate);

            List<string> cleanedText = DetailClean(filterDecorate);

            // Collapse runs of consecutive empty space inside each line.
            List<string> furtherClean  = CleanMoreSpace(cleanedText);
            List<string> mergedContent = MergeContent(furtherClean);

            FileOperators.FileWrite(destFileName, String.Join("\r\n", mergedContent));
        }
Пример #5
0
        /*
         * Builds the summary of one component (or subcomponent) and writes it to summaryStore.
         *
         * compTextPath: root directory of the component texts.
         * compName:     the target; either a component, or component\\subcomponent when the
         *               target is a subcomponent.
         * subcompTerms: additional names (acronyms) of the subcomponent; only read when the
         *               target is not a directory.
         * summaryStore: path of the file that receives the summary.
         *
         * When the target is a directory, the names of the files it contains are used as the
         * subcomponent set; otherwise the target's own name plus subcompTerms are used. If
         * the generated summary is null or empty nothing is written and a message is printed
         * to the console instead.
         */
        public void GenerateSummary(string compTextPath, string compName, List<string> subcompTerms, string summaryStore)
        {
            string       targetPath = compTextPath + "\\" + compName;
            List<string> nameSet    = new List<string>();

            // Split the paragraphs of the target into candidate sentences.
            List<string> sentences = SplitSentences(targetPath);

            if (!IsDirectory(targetPath))
            {
                // Target is a subcomponent: its own name plus the supplied acronyms.
                nameSet.Add(FileNameParse.GetFileName(targetPath));
                for (int i = 0; i < subcompTerms.Count; i++)
                {
                    nameSet.Add(subcompTerms[i]);
                }
            }
            else
            {
                // Target is a component: every file in its folder names one subcomponent.
                // Subcomponents without a file here are taken not to be part of the
                // component's critical content.
                foreach (string entry in Directory.GetFiles(targetPath))
                {
                    nameSet.Add(FileNameParse.GetFileName(entry));
                }
            }

            MMRSummary summarizer = new MMRSummary();
            string     summary    = summarizer.GenerateSummary(nameSet, sentences);

            if (string.IsNullOrEmpty(summary))
            {
                Console.WriteLine("summary is empty:" + summary + ":" + compName);
                return;
            }

            FileOperators.FileWrite(summaryStore, summary);
        }
Пример #6
0
        /*
         * Matches the component chunks of a result file against the component directories
         * found below oriPDFPath and hands every match to ReadResultFile.
         *
         * resultFile: path of the result file. Chunks are separated by an empty line
         *             ("\r\n\r\n"); the first line of a chunk holds the component name
         *             before its first tab. A name may list several comma-separated
         *             candidates.
         * oriPDFPath: directory that contains one subdirectory per component, each holding
         *             that component's pdf documents.
         *
         * Chunks whose first line has no tab, and chunks that contain nothing but line
         * breaks, are ignored.
         */
        public void CountDocRelativeSeq(string resultFile, string oriPDFPath)
        {
            // Component name -> names of the files stored directly in that component's directory.
            Dictionary<string, List<string>> compFiles = new Dictionary<string, List<string>>();

            foreach (string compDir in Directory.GetDirectories(oriPDFPath))
            {
                string       compDirName = FileNameParse.GetFileName(compDir);
                List<string> fileList    = new List<string>();
                foreach (string filePath in Directory.GetFiles(compDir))
                {
                    fileList.Add(FileNameParse.GetFileName(filePath));
                }
                compFiles.Add(compDirName, fileList);
            }

            string resultFileContent = FileOperators.ReadFileText(resultFile);

            string[] chunkSeparator = new string[] { "\r\n\r\n" };
            string[] lineSeparator  = new string[] { "\r\n" };

            foreach (string compChunk in resultFileContent.Split(chunkSeparator, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] lines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);
                if (lines.Length == 0)
                {
                    // A chunk such as "\r\n" (e.g. extra blank lines at the end of the file)
                    // has no lines at all; indexing lines[0] would throw.
                    continue;
                }

                string firstLine = lines[0];
                if (!firstLine.Contains("\t"))
                {
                    continue;
                }

                string compName = firstLine.Split('\t')[0];

                if (compName.Contains(","))
                {
                    // Several candidate names: use the first one that names a known component.
                    // NOTE(review): this branch compares against the lower-cased directory
                    // name while the single-name branch below is case-sensitive - confirm
                    // which of the two is intended.
                    foreach (string rawCandidate in compName.Split(','))
                    {
                        string candidate   = rawCandidate.Trim();
                        string matchedComp = null;

                        foreach (string fileComp in compFiles.Keys)
                        {
                            if (candidate.Equals(fileComp.ToLower()))
                            {
                                matchedComp = fileComp;
                                break;
                            }
                        }

                        if (matchedComp != null)
                        {
                            ReadResultFile(matchedComp, lines, compFiles[matchedComp]);
                            break;
                        }
                    }
                }
                else
                {
                    // Single name: an exact (case-sensitive) key lookup replaces the key scan.
                    List<string> targetFiles;
                    if (compFiles.TryGetValue(compName, out targetFiles))
                    {
                        ReadResultFile(compName, lines, targetFiles);
                    }
                }
            }
        }
Пример #7
0
        /*
         * Double-click handler of the summary tree: opens the document named by the first
         * node of tvHighlightingResult at the page stored in the clicked node's Tag.
         *
         * sender: event source (unused).
         * e:      event data; e.Node.Tag supplies the page number.
         *
         * A path ending in ".pdf" (any letter case) is opened with Adobe Reader; a Reader
         * window that already shows a file of the same name is closed first so the new
         * instance can jump to the requested page. Any other path is handled as a Word
         * document: Relocate is tried first and a new Word instance is started only when it
         * reports false.
         */
        private void tvSummaryResult_NodeMouseDoubleClick(object sender, TreeNodeMouseClickEventArgs e)
        {
            if (tvHighlightingResult.Nodes.Count == 0)
            {
                // No document node is available, so there is nothing to open.
                return;
            }

            object pageNumber = e.Node.Tag;
            string filePath   = tvHighlightingResult.Nodes[0].Text;

            if (filePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                const string titleSuffix = "- Adobe Reader";
                string       fileName    = FileNameParse.GetFileName(filePath);

                foreach (Process reader in Process.GetProcessesByName("AcroRd32"))
                {
                    try
                    {
                        // The window title looks like "<file> - Adobe Reader"; keep the file part.
                        // Cutting at the suffix and trimming is safe even when the title
                        // starts with the suffix (the old Substring(0, index - 1) threw then).
                        string runningFile = reader.MainWindowTitle;
                        int    suffixIndex = runningFile.IndexOf(titleSuffix, StringComparison.Ordinal);
                        if (suffixIndex >= 0)
                        {
                            runningFile = runningFile.Substring(0, suffixIndex).TrimEnd();
                        }

                        if (runningFile.Equals(fileName))
                        {
                            reader.Kill();
                        }
                    }
                    catch (InvalidOperationException ex)
                    {
                        // The process exited in the meantime; nothing left to close.
                        System.Diagnostics.Trace.WriteLine("Adobe Reader process already gone: " + ex.Message);
                    }
                    catch (System.ComponentModel.Win32Exception ex)
                    {
                        // The process could not be terminated; still try to open the file.
                        System.Diagnostics.Trace.WriteLine("Could not close Adobe Reader: " + ex.Message);
                    }
                    finally
                    {
                        reader.Dispose();
                    }
                }

                try
                {
                    // Disposing only releases our handle; the started reader keeps running.
                    using (Process launcher = new Process())
                    {
                        launcher.StartInfo.FileName  = "AcroRd32.exe";
                        launcher.StartInfo.Arguments = string.Format("/A \"page={0}\" \"{1}\"", pageNumber, filePath);
                        launcher.Start();
                    }
                }
                catch (Exception)
                {
                    MessageBox.Show("Failed to open pdf file. We need adobe reader to open the pdf file. Please make sure you have setup Adobe reader.");
                }

                return;
            }

            int pageNumValue = Convert.ToInt32(pageNumber);
            if (Relocate(filePath, pageNumValue))
            {
                // Relocate reported success, so no new Word instance is needed.
                return;
            }

            try
            {
                object missing  = System.Reflection.Missing.Value;
                object readOnly = false;

                Microsoft.Office.Interop.Word.Application app = new Microsoft.Office.Interop.Word.Application();
                app.Visible = true;

                var    doc   = app.Documents.Open(filePath, missing, readOnly);
                object what  = WdGoToItem.wdGoToPage;
                object which = WdGoToDirection.wdGoToAbsolute;
                app.Selection.GoTo(what, which, pageNumber, missing);
                doc.Activate();
                app.Activate();
                doc.Save();
            }
            catch (Exception ex)
            {
                // Opening in Word stays best-effort, but the failure is no longer lost.
                System.Diagnostics.Trace.WriteLine("Failed to open '" + filePath + "' in Word: " + ex);
            }
        }
Пример #8
0
        /*
         * Link handler: opens the document named by the first node of tvHighlightingResult
         * at its most relevant page, i.e. the entry of pageResult with the longest value.
         *
         * sender: event source (unused).
         * e:      event data (unused).
         *
         * Nothing happens when pageResult is empty. A path ending in ".pdf" (any letter case)
         * is opened with Adobe Reader; a Reader window that already shows a file of the same
         * name is closed first so the new instance can jump to the requested page. Any other
         * path is handled as a Word document: Relocate is tried first and a new Word
         * instance is started only when it reports false.
         */
        private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
        {
            // Rank the pages by the length of their text, longest first. The ranking is kept
            // in a list: the old code copied it into a Dictionary and read Keys.ElementAt(0),
            // which depends on an enumeration order Dictionary does not guarantee.
            var rankedPages = pageResult.OrderByDescending(x => x.Value.Length)
                                        .Select(x => x.Key)
                                        .ToList();
            if (rankedPages.Count == 0)
            {
                return;
            }

            int mostRelevantPage = rankedPages[0];

            if (tvHighlightingResult.Nodes.Count == 0)
            {
                // No document node is available, so there is nothing to open.
                return;
            }

            string filePath = tvHighlightingResult.Nodes[0].Text;

            if (filePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                const string titleSuffix = "- Adobe Reader";
                string       fileName    = FileNameParse.GetFileName(filePath);

                foreach (Process reader in Process.GetProcessesByName("AcroRd32"))
                {
                    try
                    {
                        // The window title looks like "<file> - Adobe Reader"; keep the file part.
                        // Cutting at the suffix and trimming is safe even when the title
                        // starts with the suffix (the old Substring(0, index - 1) threw then).
                        string runningFile = reader.MainWindowTitle;
                        int    suffixIndex = runningFile.IndexOf(titleSuffix, StringComparison.Ordinal);
                        if (suffixIndex >= 0)
                        {
                            runningFile = runningFile.Substring(0, suffixIndex).TrimEnd();
                        }

                        if (runningFile.Equals(fileName))
                        {
                            reader.Kill();
                        }
                    }
                    catch (InvalidOperationException ex)
                    {
                        // The process exited in the meantime; nothing left to close.
                        System.Diagnostics.Trace.WriteLine("Adobe Reader process already gone: " + ex.Message);
                    }
                    catch (System.ComponentModel.Win32Exception ex)
                    {
                        // The process could not be terminated; still try to open the file.
                        System.Diagnostics.Trace.WriteLine("Could not close Adobe Reader: " + ex.Message);
                    }
                    finally
                    {
                        reader.Dispose();
                    }
                }

                try
                {
                    // Disposing only releases our handle; the started reader keeps running.
                    using (Process launcher = new Process())
                    {
                        launcher.StartInfo.FileName  = "AcroRd32.exe";
                        launcher.StartInfo.Arguments = string.Format("/A \"page={0}\" \"{1}\"", mostRelevantPage, filePath);
                        launcher.Start();
                    }
                }
                catch (Exception)
                {
                    MessageBox.Show("Failed to open pdf file. We need adobe reader to open the pdf file. Please make sure you have setup Adobe reader.");
                }

                return;
            }

            if (Relocate(filePath, mostRelevantPage))
            {
                // Relocate reported success, so no new Word instance is needed.
                return;
            }

            try
            {
                object missing    = System.Reflection.Missing.Value;
                object readOnly   = false;
                object targetPage = mostRelevantPage;

                Microsoft.Office.Interop.Word.Application app = new Microsoft.Office.Interop.Word.Application();
                app.Visible = true;

                var    doc   = app.Documents.Open(filePath, missing, readOnly);
                object what  = WdGoToItem.wdGoToPage;
                object which = WdGoToDirection.wdGoToAbsolute;
                app.Selection.GoTo(what, which, targetPage, missing);
                doc.Activate();
                app.Activate();
                doc.Save();
            }
            catch (Exception ex)
            {
                // Opening in Word stays best-effort, but the failure is no longer lost.
                System.Diagnostics.Trace.WriteLine("Failed to open '" + filePath + "' in Word: " + ex);
            }
        }