/*
 * Cleans one document, or every document found directly inside a directory and
 * inside its immediate sub-directories (deeper levels are not visited).
 *
 * Cleaning steps as listed by the original author (they are carried out by
 * CleanSingleFile and its helpers, not here - NOTE(review): confirm there):
 *   - delete: contents, list of tables, list of figures, appendix, revision history
 *   - split as paragraphs
 *   - filter: number -> "", camel split, lowercase, more than two empty lines
 *
 * originalText: path of the source file or of the source directory.
 * cleanedText:  destination file path when originalText is a file; destination
 *               root directory when originalText is a directory. In the
 *               directory case every output is named "<source name>.txt".
 */
public void CleanDir(string originalText, string cleanedText)
{
    bool sourceIsDirectory = (File.GetAttributes(originalText) & FileAttributes.Directory) != 0;
    if (!sourceIsDirectory)
    {
        // A single file is cleaned straight into the destination path given by the caller.
        CleanSingleFile(originalText, cleanedText);
        return;
    }

    // Files sitting directly in the root directory.
    foreach (string topLevelFile in Directory.GetFiles(originalText))
    {
        string topLevelName = FileNameParse.GetFileName(topLevelFile);
        CleanSingleFile(topLevelFile, cleanedText + "\\" + topLevelName + ".txt");
    }

    // Files one level down; the sub-directory name is mirrored in the destination.
    foreach (string childDir in Directory.GetDirectories(originalText))
    {
        string[] childFiles = Directory.GetFiles(childDir);
        string childDirName = FileNameParse.GetFileName(childDir);
        for (int i = 0; i < childFiles.Length; i++)
        {
            string childFileName = FileNameParse.GetFileName(childFiles[i]);
            string target = cleanedText + "\\" + childDirName + "\\" + childFileName + ".txt";
            CleanSingleFile(childFiles[i], target);
        }
    }
}
/*
 * Computes the topic density for each given file and records it in two maps:
 *   - fileNameTopicDensityMap, keyed by the file name, used as a cache so that
 *     files sharing a name are only analysed once;
 *   - docTopicDensityMap, keyed by the full entry passed in.
 *
 * fileEntities: paths of the files to analyse.
 *
 * A file that yields no usable term (every token has length <= 1) is skipped.
 * The previous implementation used 'break' here, which silently dropped every
 * file that came after the first empty one.
 */
private void CalDocDensityMap(string[] fileEntities)
{
    char[] delimiters = { ' ' };

    foreach (string fileEntity in fileEntities)
    {
        string fileName = FileNameParse.GetFileName(fileEntity);

        Dictionary<string, float> topicDensity;
        if (!fileNameTopicDensityMap.TryGetValue(fileName, out topicDensity))
        {
            // NOTE(review): only "\n" is turned into a space, so with CRLF input the
            // last token of a line may keep a trailing '\r' - confirm what ReadFileText returns.
            string fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
            string[] rawTerms = fileContent.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

            // Collect the terms of the file, ignoring single-character tokens.
            List<string> fileTermList = new List<string>();
            foreach (string rawTerm in rawTerms)
            {
                if (rawTerm.Length > 1)
                {
                    fileTermList.Add(rawTerm);
                }
            }

            if (fileTermList.Count == 0)
            {
                // Nothing to analyse in this file; carry on with the next one.
                continue;
            }

            // Topic and related density for this one file.
            topicDensity = CalWindowDensityInOneFile(fileTermList);
            fileNameTopicDensityMap.Add(fileName, topicDensity);
        }

        // Indexer assignment instead of Add: a path that shows up twice no longer throws.
        docTopicDensityMap[fileEntity] = topicDensity;
    }
}
/*
 * Stems every file below sourceFile and stores the result under stemmedPath,
 * keeping the original directory hierarchy.
 *
 * sourceFile:  directory to process. Nothing happens when IsDirectory reports
 *              that it is not a directory.
 * stemmedPath: directory receiving the stemmed copies; sub-directories are
 *              mirrored as "<stemmedPath>\<sub-directory name>".
 *
 * Each line is split on spaces and commas, every term is stemmed, and the
 * stemmed terms are written back separated (and followed) by a single space,
 * one "\r\n"-terminated output line per input line.
 */
public void FileDirStemming(string sourceFile, string stemmedPath)
{
    if (!IsDirectory(sourceFile))
    {
        return;
    }

    // Recurse into the sub-directories first, mirroring their names in the target.
    foreach (string subDir in Directory.GetDirectories(sourceFile))
    {
        string subDirName = FileNameParse.GetFileName(subDir);
        FileDirStemming(subDir, stemmedPath + "\\" + subDirName);
    }

    string[] separators = { " ", "," };
    foreach (string subFile in Directory.GetFiles(sourceFile))
    {
        string[] fileLines = FileOperators.ReadFileLines(subFile);
        string fileName = FileNameParse.GetFileName(subFile);

        // A builder avoids re-copying the whole accumulated text for every term and line.
        System.Text.StringBuilder stemmedContent = new System.Text.StringBuilder();
        foreach (string fileLine in fileLines)
        {
            string[] terms = fileLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // One stemmer per line, as before: Porter2 is not visible from here,
            // so whether an instance can safely be reused is left unchanged.
            Porter2 porter = new Porter2();
            foreach (string term in terms)
            {
                stemmedContent.Append(porter.stem(term)).Append(' ');
            }
            stemmedContent.Append("\r\n");
        }

        FileOperators.FileWrite(stemmedPath + "\\" + fileName, stemmedContent.ToString());
    }
}
/*
 * Runs the cleaning pipeline on one file and writes the result.
 *
 * sourceFilePath: file whose lines are read and cleaned.
 * destFileName:   path the cleaned text is written to; the cleaned lines are
 *                 joined with "\r\n".
 *
 * Pipeline order: DeleteDecorate -> DetailClean -> CleanMoreSpace -> MergeContent.
 * The step descriptions below are the original author's notes; the helpers
 * themselves are defined elsewhere.
 */
private void CleanSingleFile(string sourceFilePath, string destFileName)
{
    string[] oriLines = FileOperators.ReadFileLines(sourceFilePath);

    // Drops contents, list of figures, list of tables, appendix.
    List<string> withoutDecoration = DeleteDecorate(oriLines);
    List<string> detailCleaned = DetailClean(withoutDecoration);

    // Filters runs of consecutive empty space in each line.
    List<string> spaceCollapsed = CleanMoreSpace(detailCleaned);
    List<string> mergedContent = MergeContent(spaceCollapsed);

    FileOperators.FileWrite(destFileName, String.Join("\r\n", mergedContent));
}
/*
 * Generates a summary for one component and writes it to summaryStore.
 *
 * compTextPath: folder that contains the component texts.
 * compName:     the target the summary is about. Either a single component, or
 *               "component\\subcomponent" when the target is a subcomponent.
 * subcompTerms: additional names (the original code treats them as acronyms)
 *               appended to the name list when the target is a subcomponent.
 * summaryStore: path of the file receiving the summary.
 *
 * When the generated summary is null or empty nothing is written; a message is
 * printed to the console instead.
 */
public void GenerateSummary(string compTextPath, string compName, List<string> subcompTerms, string summaryStore)
{
    string compPath = compTextPath + "\\" + compName;
    List<string> subcompNames = new List<string>();

    // Split the paragraphs into candidate sentences.
    List<string> candidateSentences = SplitSentences(compPath);

    if (!IsDirectory(compPath))
    {
        // The target is a 'subcomponent': a single file, named by itself plus the given terms.
        subcompNames.Add(FileNameParse.GetFileName(compPath));
        for (int i = 0; i < subcompTerms.Count; i++)
        {
            subcompNames.Add(subcompTerms[i]);
        }
    }
    else
    {
        // The target is a 'component': every file in its folder names one subcomponent.
        // Subcomponents without a file here are not considered.
        string[] subcompFiles = Directory.GetFiles(compPath);
        for (int i = 0; i < subcompFiles.Length; i++)
        {
            subcompNames.Add(FileNameParse.GetFileName(subcompFiles[i]));
        }
    }

    MMRSummary summarizer = new MMRSummary();
    string summary = summarizer.GenerateSummary(subcompNames, candidateSentences);

    if (string.IsNullOrEmpty(summary))
    {
        Console.WriteLine("summary is empty:" + summary + ":" + compName);
        return;
    }

    FileOperators.FileWrite(summaryStore, summary);
}
/*
 * Matches the component chunks of a result file against the component folders
 * found under oriPDFPath and hands every match to ReadResultFile.
 *
 * resultFile: text file made of chunks separated by an empty line ("\r\n\r\n").
 *             Only chunks whose first line contains a tab are used; the text
 *             before the first tab is the component name, or a comma-separated
 *             list of candidate names.
 * oriPDFPath: directory with one sub-directory per component; the pdf
 *             documents of a component are stored in its sub-directory.
 *
 * Matching rules (unchanged):
 *   - single name: must equal a folder name exactly (case-sensitive);
 *   - candidate list: each trimmed candidate is compared with the lower-cased
 *     folder names, and the first candidate that matches wins.
 */
public void CountDocRelativeSeq(string resultFile, string oriPDFPath)
{
    // Component name -> names of the files in that component's folder.
    Dictionary<string, List<string>> compFiles = new Dictionary<string, List<string>>();
    foreach (string compDir in Directory.GetDirectories(oriPDFPath))
    {
        string dirCompName = FileNameParse.GetFileName(compDir);
        List<string> fileList = new List<string>();
        foreach (string filePath in Directory.GetFiles(compDir))
        {
            fileList.Add(FileNameParse.GetFileName(filePath));
        }
        compFiles.Add(dirCompName, fileList);
    }

    // Read the result file and walk through its chunks.
    string resultFileContent = FileOperators.ReadFileText(resultFile);
    string[] chunkSeparator = { "\r\n\r\n" };
    string[] lineSeparator = { "\r\n" };

    foreach (string compChunk in resultFileContent.Split(chunkSeparator, StringSplitOptions.RemoveEmptyEntries))
    {
        string[] lines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);
        if (lines.Length == 0)
        {
            // A chunk made only of line breaks (e.g. extra blank lines at the end
            // of the file) has no first line; skip it instead of failing on lines[0].
            continue;
        }

        string firstLine = lines[0];
        if (!firstLine.Contains("\t"))
        {
            continue;
        }

        string compName = firstLine.Split('\t')[0];
        if (compName.Contains(","))
        {
            bool matched = false;
            foreach (string rawCandidate in compName.Split(','))
            {
                string candidate = rawCandidate.Trim();
                foreach (KeyValuePair<string, List<string>> entry in compFiles)
                {
                    if (candidate.Equals(entry.Key.ToLower()))
                    {
                        ReadResultFile(entry.Key, lines, entry.Value);
                        matched = true;
                        break;
                    }
                }

                if (matched)
                {
                    // Only the first matching candidate is processed.
                    break;
                }
            }
        }
        else
        {
            // Exact match: a direct lookup is equivalent to scanning all the keys.
            List<string> targetFiles;
            if (compFiles.TryGetValue(compName, out targetFiles))
            {
                ReadResultFile(compName, lines, targetFiles);
            }
        }
    }
}
/*
 * Opens the highlighted document at the page stored in the Tag of the
 * double-clicked summary node.
 *
 * The document path is the text of the first node of tvHighlightingResult.
 *   - pdf files: an Adobe Reader instance that already shows the same file is
 *     closed, then AcroRd32.exe is started with the "/A page=N" argument;
 *   - any other file: Relocate is tried first; when it returns false the file
 *     is opened in Word and the selection is moved to the page.
 *
 * sender: unused.
 * e:      supplies the clicked node; its Tag is used as the page number.
 */
private void tvSummaryResult_NodeMouseDoubleClick(object sender, TreeNodeMouseClickEventArgs e)
{
    if (tvHighlightingResult.Nodes.Count == 0)
    {
        // No document is loaded, so there is nothing to open.
        return;
    }

    object pageNumber = e.Node.Tag;
    string filePath = tvHighlightingResult.Nodes[0].Text;

    // File extensions are not case-sensitive on Windows, so ".PDF" is a pdf too.
    if (filePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
    {
        string fileName = FileNameParse.GetFileName(filePath);
        const string readerTitleSuffix = "- Adobe Reader";

        foreach (Process readerInstance in Process.GetProcessesByName("AcroRd32"))
        {
            using (readerInstance)
            {
                try
                {
                    // The window title looks like "<file name> - Adobe Reader".
                    string runningFile = readerInstance.MainWindowTitle;
                    int suffixIndex = runningFile.IndexOf(readerTitleSuffix, StringComparison.Ordinal);
                    if (suffixIndex >= 0)
                    {
                        // Also drop the character (a space) in front of the suffix;
                        // a title that starts with the suffix names no file.
                        runningFile = suffixIndex > 0
                            ? runningFile.Substring(0, suffixIndex - 1)
                            : string.Empty;
                    }

                    if (runningFile.Equals(fileName))
                    {
                        readerInstance.Kill();
                    }
                }
                catch (InvalidOperationException)
                {
                    // The reader instance exited by itself in the meantime; nothing left to close.
                }
                catch (System.ComponentModel.Win32Exception)
                {
                    // The instance could not be terminated; still try to open the file below.
                }
            }
        }

        try
        {
            using (Process reader = new Process())
            {
                reader.StartInfo.FileName = "AcroRd32.exe";
                reader.StartInfo.Arguments = string.Format("/A \"page={0}\" \"{1}\"", pageNumber, filePath);
                reader.Start();
            }
        }
        catch
        {
            MessageBox.Show("Failed to open pdf file. We need adobe reader to open the pdf file. Please make sure you have setup Adobe reader.");
        }

        return;
    }

    object missing = System.Reflection.Missing.Value;
    int pageNumValue = Convert.ToInt32(pageNumber);
    if (Relocate(filePath, pageNumValue))
    {
        // Relocate reported success, so Word does not have to be started.
        return;
    }

    try
    {
        Microsoft.Office.Interop.Word.Application app = new Microsoft.Office.Interop.Word.Application();
        app.Visible = true;
        object readOnly = false;
        var doc = app.Documents.Open(filePath, missing, readOnly);
        object what = WdGoToItem.wdGoToPage;
        object which = WdGoToDirection.wdGoToAbsolute;
        app.Selection.GoTo(what, which, pageNumber, missing);
        doc.Activate();
        app.Activate();
        // NOTE(review): the document is saved although it was only opened for
        // navigation - kept as is, confirm that this is intended.
        doc.Save();
    }
    catch (Exception ex)
    {
        // Opening in Word stays best-effort, but the failure is no longer lost.
        System.Diagnostics.Trace.WriteLine("Failed to open '" + filePath + "' in Word: " + ex);
    }
}
/*
 * Opens the highlighted document at its most relevant page, i.e. the entry of
 * pageResult whose text is the longest.
 *
 * The document path is the text of the first node of tvHighlightingResult.
 *   - pdf files: an Adobe Reader instance that already shows the same file is
 *     closed, then AcroRd32.exe is started with the "/A page=N" argument;
 *   - any other file: Relocate is tried first; when it returns false the file
 *     is opened in Word and the selection is moved to the page.
 *
 * sender, e: unused.
 */
private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
{
    if (pageResult == null || !pageResult.Any())
    {
        return;
    }

    if (tvHighlightingResult.Nodes.Count == 0)
    {
        // No document is loaded, so there is nothing to open.
        return;
    }

    // Take the winner straight from the ordered sequence. Copying the sequence
    // into a Dictionary and reading its first key depends on an enumeration
    // order that Dictionary does not guarantee.
    int mostRelevantPage = pageResult.OrderByDescending(page => page.Value.Length).First().Key;
    string filePath = tvHighlightingResult.Nodes[0].Text;

    // File extensions are not case-sensitive on Windows, so ".PDF" is a pdf too.
    if (filePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
    {
        string fileName = FileNameParse.GetFileName(filePath);
        const string readerTitleSuffix = "- Adobe Reader";

        foreach (Process readerInstance in Process.GetProcessesByName("AcroRd32"))
        {
            using (readerInstance)
            {
                try
                {
                    // The window title looks like "<file name> - Adobe Reader".
                    string runningFile = readerInstance.MainWindowTitle;
                    int suffixIndex = runningFile.IndexOf(readerTitleSuffix, StringComparison.Ordinal);
                    if (suffixIndex >= 0)
                    {
                        // Also drop the character (a space) in front of the suffix;
                        // a title that starts with the suffix names no file.
                        runningFile = suffixIndex > 0
                            ? runningFile.Substring(0, suffixIndex - 1)
                            : string.Empty;
                    }

                    if (runningFile.Equals(fileName))
                    {
                        readerInstance.Kill();
                    }
                }
                catch (InvalidOperationException)
                {
                    // The reader instance exited by itself in the meantime; nothing left to close.
                }
                catch (System.ComponentModel.Win32Exception)
                {
                    // The instance could not be terminated; still try to open the file below.
                }
            }
        }

        try
        {
            using (Process reader = new Process())
            {
                reader.StartInfo.FileName = "AcroRd32.exe";
                reader.StartInfo.Arguments = string.Format("/A \"page={0}\" \"{1}\"", mostRelevantPage, filePath);
                reader.Start();
            }
        }
        catch
        {
            MessageBox.Show("Failed to open pdf file. We need adobe reader to open the pdf file. Please make sure you have setup Adobe reader.");
        }

        return;
    }

    object missing = System.Reflection.Missing.Value;
    if (Relocate(filePath, mostRelevantPage))
    {
        // Relocate reported success, so Word does not have to be started.
        return;
    }

    try
    {
        Microsoft.Office.Interop.Word.Application app = new Microsoft.Office.Interop.Word.Application();
        app.Visible = true;
        object readOnly = false;
        var doc = app.Documents.Open(filePath, missing, readOnly);
        object what = WdGoToItem.wdGoToPage;
        object which = WdGoToDirection.wdGoToAbsolute;
        app.Selection.GoTo(what, which, mostRelevantPage, missing);
        doc.Activate();
        app.Activate();
        // NOTE(review): the document is saved although it was only opened for
        // navigation - kept as is, confirm that this is intended.
        doc.Save();
    }
    catch (Exception ex)
    {
        // Opening in Word stays best-effort, but the failure is no longer lost.
        System.Diagnostics.Trace.WriteLine("Failed to open '" + filePath + "' in Word: " + ex);
    }
}