Example #1
0
 /* Stems every file below a directory and stores the results in the same
  * hierarchy as the original.
  *
  * sourceFile:  path to process. Nothing is done unless IsDirectory(sourceFile) is true.
  * stemmedPath: directory that receives the stemmed files; each sub-directory of
  *              sourceFile is mapped to stemmedPath\<sub-directory name>.
  *
  * Each line is split on " " and ",", every term is passed through Porter2.stem,
  * and the stemmed terms are written back separated (and followed) by a single
  * space, with "\r\n" terminating every line.
  */
 public void FileDirStemming(string sourceFile, string stemmedPath)
 {
     if (!IsDirectory(sourceFile))
     {
         return;
     }

     // Recurse into the sub-directories first, mirroring their names under stemmedPath.
     foreach (string subDir in Directory.GetDirectories(sourceFile))
     {
         string subDirName = FileNameParse.GetFileName(subDir);
         FileDirStemming(subDir, stemmedPath + "\\" + subDirName);
     }

     // The separator set never changes, so it is built once instead of once per line.
     string[] separators = { " ", "," };

     foreach (string subFile in Directory.GetFiles(sourceFile))
     {
         string[] fileLines = FileOperators.ReadFileLines(subFile);
         string   fileName  = FileNameParse.GetFileName(subFile);

         // A StringBuilder replaces repeated string concatenation, which re-copied the
         // whole accumulated text for every term and every line.
         System.Text.StringBuilder stemmedContent = new System.Text.StringBuilder();
         foreach (string fileLine in fileLines)
         {
             // A fresh stemmer per line, as before; whether Porter2 keeps state between
             // calls is not visible here, so the instance is deliberately not shared.
             Porter2  porter = new Porter2();
             string[] terms  = fileLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
             foreach (string term in terms)
             {
                 stemmedContent.Append(porter.stem(term)).Append(' ');
             }
             stemmedContent.Append("\r\n");
         }

         FileOperators.FileWrite(stemmedPath + "\\" + fileName, stemmedContent.ToString());
     }
 }
Example #2
0
        /* Extracts the text of every shape that has a text frame from a PowerPoint
         * file and writes it, space separated, to a text file.
         *
         * sourceFileName: presentation to open through PowerPoint interop.
         * destFileName:   file that receives the collected text via FileOperators.FileWrite.
         *
         * The presentation is closed and the PowerPoint application is quit even when
         * opening, reading or writing throws; the exception itself still propagates.
         */
        public static void ExecuteExtraction(string sourceFileName, string destFileName)
        {
            Application  _application = new Microsoft.Office.Interop.PowerPoint.Application();
            Presentation pptFile      = null;

            try
            {
                var pres = _application.Presentations;
                pptFile = pres.Open(sourceFileName, MsoTriState.msoFalse, MsoTriState.msoFalse, MsoTriState.msoFalse);

                // StringBuilder instead of += so the cost stays linear in the amount of text.
                System.Text.StringBuilder storeContent = new System.Text.StringBuilder();
                int slideCount = pptFile.Slides.Count;

                for (int i = 1; i <= slideCount; i++)
                {
                    Slide slide = pptFile.Slides[i];
                    // NOTE(review): this mutates the slide although nothing in this method
                    // saves the presentation - kept for compatibility, confirm it is needed.
                    slide.FollowMasterBackground = MsoTriState.msoFalse;
                    foreach (var item in slide.Shapes)
                    {
                        var shape = (Microsoft.Office.Interop.PowerPoint.Shape)item;
                        if (shape.HasTextFrame == MsoTriState.msoTrue)
                        {
                            storeContent.Append(shape.TextFrame.TextRange.Text).Append(' ');
                        }
                    }
                }

                FileOperators.FileWrite(destFileName, storeContent.ToString());
            }
            finally
            {
                // Nested so that a failing Close() cannot prevent Quit() from running.
                try
                {
                    if (pptFile != null)
                    {
                        pptFile.Close();
                    }
                }
                finally
                {
                    _application.Quit();
                }
            }
        }
Example #3
0
        /* Reads the text of sourceFileName through a TextExtractor and writes it
         * to destFileName with FileOperators.FileWrite.
         */
        public static void ExecuteWordExtraction(string sourceFileName, string destFileName)
        {
            string extractedText = new TextExtractor(sourceFileName).ExtractText();
            FileOperators.FileWrite(destFileName, extractedText);
        }
Example #4
0
        //too slow, deprecated
        /* Extracts the text of every paragraph of a Word document through Word
         * interop and writes the concatenated text to a file.
         *
         * sourceFileName: document to open.
         * destFileName:   file that receives the text via FileOperators.FileWrite.
         *
         * A failure while reading paragraphs or writing the output is reported on the
         * console and not rethrown (the original swallowed it without any trace).
         * A failure to open the document still propagates to the caller, but the Word
         * application is now quit in that case as well.
         */
        public static void ExecuteExtraction(string sourceFileName, string destFileName)
        {
            Application _application = new Application();

            try
            {
                object   docFilenameAsObject = sourceFileName;
                Document _document           = _application.Documents.Open(ref docFilenameAsObject);

                try
                {
                    // StringBuilder keeps the concatenation linear in the document size.
                    System.Text.StringBuilder docContent = new System.Text.StringBuilder();
                    foreach (Paragraph para in _document.Paragraphs)
                    {
                        Range paraRange = para.Range;
                        docContent.Append(paraRange.Text);
                    }

                    FileOperators.FileWrite(destFileName, docContent.ToString());
                }
                catch (Exception ex)
                {
                    // Still best-effort, but no longer silent.
                    Console.WriteLine("Text extraction failed for " + sourceFileName + ": " + ex.Message);
                }
                finally
                {
                    // Single cleanup path for both success and failure.
                    ((_Document)_document).Close();
                }
            }
            finally
            {
                ((_Application)_application).Quit();
            }
        }
Example #5
0
        /* Cleans a single text file and writes the result.
         *
         * sourceFilePath: file whose lines are read with FileOperators.ReadFileLines.
         * destFileName:   file that receives the cleaned lines joined with "\r\n".
         *
         * The lines pass, in order, through DeleteDecorate, DetailClean,
         * CleanMoreSpace and MergeContent.
         */
        private void CleanSingleFile(string sourceFilePath, string destFileName)
        {
            string[] oriLines = FileOperators.ReadFileLines(sourceFilePath);

            // contents, list of figures, list of tables, appendix
            List<string> filterDecorate = DeleteDecorate(oriLines);
            List<string> cleanedText    = DetailClean(filterDecorate);
            // filter runs of consecutive empty space in each line
            List<string> furtherClean   = CleanMoreSpace(cleanedText);
            List<string> mergedContent  = MergeContent(furtherClean);

            FileOperators.FileWrite(destFileName, String.Join("\r\n", mergedContent));
        }
Example #6
0
        /* Generates the summary of one component and stores it.
         *
         * compTextPath: root folder; compTextPath\compName is the location that is summarised.
         * compName:     the target component - a single component, or component\\subcomponent
         *               when the target is a subcomponent.
         * subcompTerms: extra terms added to the name list only when the target is not a directory.
         * summaryStore: file that receives the summary; nothing is written when the summary
         *               is null or empty (a console message is printed instead).
         */
        public void GenerateSummary(string compTextPath, string compName, List<string> subcompTerms, string summaryStore)
        {
            string       targetPath = compTextPath + "\\" + compName;
            List<string> nameList   = new List<string>();
            // split the paragraph into sentences
            List<string> sentences  = SplitSentences(targetPath);

            if (!IsDirectory(targetPath))
            {
                // The target is a 'subcomponent': one file, named after itself plus the given terms.
                nameList.Add(FileNameParse.GetFileName(targetPath));
                foreach (string extraTerm in subcompTerms)
                {
                    nameList.Add(extraTerm);
                }
            }
            else
            {
                // The target is a 'component': every file in its folder contributes its file name.
                string[] memberFiles = Directory.GetFiles(targetPath);
                for (int idx = 0; idx < memberFiles.Length; idx++)
                {
                    nameList.Add(FileNameParse.GetFileName(memberFiles[idx]));
                }
            }

            MMRSummary summarizer = new MMRSummary();
            string     summary    = summarizer.GenerateSummary(nameList, sentences);

            if (string.IsNullOrEmpty(summary))
            {
                Console.WriteLine("summary is empty:" + summary + ":" + compName);
                return;
            }

            FileOperators.FileWrite(summaryStore, summary);
        }
Example #7
0
        /* Generates and stores the tf-idf of each file.
         *
         * Writes the header line ("fileName," followed by the tactic terms separated
         * by ';') to storePath, computes the idf values, then updates the tf-idf for
         * the files directly inside fileDir and for the files of each of its
         * immediate sub-directories.
         */
        public void calTfidf()
        {
            string header = "fileName,";
            foreach (string term in tacticTerms)
            {
                header = header + term + ";";
            }
            // Drop the last character (the trailing separator).
            header = header.Substring(0, header.Length - 1);
            FileOperators.FileWrite(this.storePath, header);

            // calculate idf
            terms_idf = idf();

            updateTfidf(Directory.GetFiles(fileDir));

            string[] subDirs = Directory.GetDirectories(fileDir);
            for (int i = 0; i < subDirs.Length; i++)
            {
                updateTfidf(Directory.GetFiles(subDirs[i]));
            }
        }
Example #8
0
        /*
         * Extracts the component-related paragraphs according to the component-related files.
         *
         * For every component in compFiles that also has an entry in compTerms, each
         * related file is looked up under paras, split into paragraphs on blank lines
         * ("\r\n\r\n"), lower-cased, and kept when IsCompRelated accepts it. Paragraphs
         * whose letters-only form was already seen for the same component are skipped.
         * The kept paragraphs (inner line breaks removed, one paragraph per line) are
         * written to storePath + comp + ".txt".
         *
         * compRelatedFile: not used by this method.
         * paras:           folder that holds the paragraph files.
         * storePath:       prefix of the output file name; it is concatenated directly with
         *                  the component name, so it has to end with a path separator if it
         *                  names a folder.
         */
        private void ExtractCompParagraphs(string compRelatedFile, string paras, string storePath)
        {
            Regex    nonLetters     = new Regex(@"[^a-zA-Z]");
            string[] paragraphBreak = new string[] { "\r\n\r\n" };

            foreach (string comp in compFiles.Keys)
            {
                Dictionary<string, float> compTermWeight;
                if (!compTerms.TryGetValue(comp, out compTermWeight))
                {
                    continue;
                }

                System.Text.StringBuilder compParagraphs = new System.Text.StringBuilder();
                // A hash set makes the duplicate check O(1) per paragraph instead of a list scan.
                HashSet<string> seenParagraphs = new HashSet<string>();

                foreach (string file in compFiles[comp])
                {
                    string paraFile = ResolveParagraphFile(paras, file);
                    if (paraFile == null)
                    {
                        Console.WriteLine("Please check the existence of file: " + file);
                        continue;
                    }

                    string   fileContent = FileOperators.ReadFileText(paraFile);
                    string[] paraChunks  = fileContent.Split(paragraphBreak, StringSplitOptions.RemoveEmptyEntries);
                    foreach (string tmpPara in paraChunks)
                    {
                        if (tmpPara.Trim().Length == 0)
                        {
                            continue;
                        }

                        string lowerCaseLine = tmpPara.ToLower();
                        if (!IsCompRelated(lowerCaseLine, compTermWeight))
                        {
                            continue;
                        }

                        // Paragraphs are compared by their letters only, so copies that differ
                        // merely in spacing, digits or punctuation count as duplicates.
                        string pureWords = nonLetters.Replace(lowerCaseLine, "");
                        if (seenParagraphs.Add(pureWords))
                        {
                            compParagraphs.Append(lowerCaseLine.Replace("\r\n", "")).Append("\r\n");
                        }
                    }
                }

                FileOperators.FileWrite(storePath + comp + ".txt", compParagraphs.ToString());
            }
        }

        /* Returns the existing path of a paragraph file: paras\file as given, or the same
         * path with every space replaced by '-' when only that variant exists.
         * Returns null when neither file exists.
         */
        private static string ResolveParagraphFile(string paras, string file)
        {
            string paraFile = paras + "\\" + file;
            if (File.Exists(paraFile))
            {
                return paraFile;
            }

            string dashedPath = paraFile.Replace(" ", "-");
            return File.Exists(dashedPath) ? dashedPath : null;
        }