Example #1
0
 /// <summary>
 /// Appends the manually supplied search terms for <c>targetTopicName</c> to
 /// <paramref name="topicTerms"/>.
 /// </summary>
 /// <remarks>
 /// The file returned by <c>Configures.GetManualSearchTermPath()</c> is read line by line.
 /// Each line is expected to look like <c>name:term1,term2,...</c>; only lines whose name
 /// equals <c>targetTopicName</c> contribute. Terms are trimmed, and blank terms as well as
 /// terms already present in the list are skipped. Nothing happens when the file is missing.
 /// </remarks>
 /// <param name="topicTerms">List that receives the additional terms; modified in place.</param>
 private void AddUserSearchTerms(List<string> topicTerms)
 {
     string extraSearchTermPath = Configures.GetManualSearchTermPath();
     if (!File.Exists(extraSearchTermPath))
     {
         return;
     }

     foreach (string line in FileOperators.ReadFileLines(extraSearchTermPath))
     {
         int colonIndex = line.IndexOf(':');
         if (colonIndex < 0)
         {
             continue;
         }

         string compName = line.Substring(0, colonIndex);
         if (!compName.Equals(targetTopicName))
         {
             continue;
         }

         foreach (string extraTerm in line.Substring(colonIndex + 1).Split(','))
         {
             string trimmedTerm = extraTerm.Trim();

             // "a,,b" or a trailing comma produces empty pieces; an empty search term is never useful.
             if (trimmedTerm.Length > 0 && !topicTerms.Contains(trimmedTerm))
             {
                 topicTerms.Add(trimmedTerm);
             }
         }
     }
 }
Example #2
0
        /// <summary>
        /// Runs <c>TextExtractor.ExtractText</c> on <paramref name="sourceFileName"/> and writes the
        /// resulting text to <paramref name="destFileName"/> via <c>FileOperators.FileWrite</c>.
        /// </summary>
        /// <param name="sourceFileName">Path handed to the <c>TextExtractor</c> constructor.</param>
        /// <param name="destFileName">Path the extracted text is written to.</param>
        public static void ExecuteWordExtraction(string sourceFileName, string destFileName)
        {
            string extractedText = new TextExtractor(sourceFileName).ExtractText();
            FileOperators.FileWrite(destFileName, extractedText);
        }
Example #3
0
        /// <summary>
        /// Computes the topic density of every entry in <paramref name="fileEntities"/> and records it
        /// in <c>docTopicDensityMap</c> (keyed by the entry itself) and in
        /// <c>fileNameTopicDensityMap</c> (keyed by <c>FileNameParse.GetFileName</c> of the entry).
        /// </summary>
        /// <remarks>
        /// A density already stored for the same file name is reused instead of being recomputed.
        /// File text is lower-cased, newlines become spaces, and only space-separated tokens longer
        /// than one character are kept. A file that yields no such token is skipped.
        /// </remarks>
        /// <param name="fileEntities">Paths of the files to process.</param>
        private void CalDocDensityMap(string[] fileEntities)
        {
            char[] delimiters = new char[] { ' ' };

            foreach (string fileEntity in fileEntities)
            {
                string fileName = FileNameParse.GetFileName(fileEntity);

                Dictionary<string, float> topicDensity;
                if (!fileNameTopicDensityMap.TryGetValue(fileName, out topicDensity))
                {
                    string   fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
                    string[] tmpList     = fileContent.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

                    // Collect the terms of the file, dropping single-character tokens.
                    List<string> fileTermList = new List<string>();
                    foreach (string tmpTerm in tmpList)
                    {
                        if (tmpTerm.Length > 1)
                        {
                            fileTermList.Add(tmpTerm);
                        }
                    }

                    if (fileTermList.Count == 0)
                    {
                        // This used to be `break`, which silently dropped every file after the
                        // first empty one. Only the empty file itself should be skipped.
                        continue;
                    }

                    // Topic -> density for this one file.
                    topicDensity = CalWindowDensityInOneFile(fileTermList);
                    fileNameTopicDensityMap.Add(fileName, topicDensity);
                }

                docTopicDensityMap.Add(fileEntity, topicDensity);
            }
        }
Example #4
0
        /// <summary>
        /// Opens a PowerPoint presentation through Office interop, concatenates the text of every
        /// shape that has a text frame (each followed by a single space) and writes the result to
        /// <paramref name="destFileName"/>.
        /// </summary>
        /// <remarks>
        /// The presentation is closed and the PowerPoint application is quit even when extraction
        /// or writing fails, so a failure no longer leaves a PowerPoint process behind.
        /// </remarks>
        /// <param name="sourceFileName">Path of the presentation to open.</param>
        /// <param name="destFileName">Path the extracted text is written to.</param>
        public static void ExecuteExtraction(string sourceFileName, string destFileName)
        {
            Application  _application = new Microsoft.Office.Interop.PowerPoint.Application();
            Presentation pptFile      = null;

            try
            {
                pptFile = _application.Presentations.Open(sourceFileName, MsoTriState.msoFalse, MsoTriState.msoFalse, MsoTriState.msoFalse);

                // StringBuilder avoids re-copying the whole text for every shape.
                System.Text.StringBuilder storeContent = new System.Text.StringBuilder();
                int slideCount = pptFile.Slides.Count;

                // Slides is a 1-based collection.
                for (int i = 1; i <= slideCount; i++)
                {
                    Slide slide = pptFile.Slides[i];
                    slide.FollowMasterBackground = MsoTriState.msoFalse;
                    foreach (var item in slide.Shapes)
                    {
                        var shape = (Microsoft.Office.Interop.PowerPoint.Shape)item;
                        if (shape.HasTextFrame == MsoTriState.msoTrue)
                        {
                            storeContent.Append(shape.TextFrame.TextRange.Text).Append(' ');
                        }
                    }
                }

                FileOperators.FileWrite(destFileName, storeContent.ToString());
            }
            finally
            {
                try
                {
                    if (pptFile != null)
                    {
                        pptFile.Close();
                    }
                }
                finally
                {
                    _application.Quit();
                }
            }
        }
Example #5
0
        /* TODO: update here in the future. The frequency is not considered yet.
         */
        /// <summary>
        /// Reads the lines of <c>n_gramFile</c> and returns the bigrams whose two words are both
        /// longer than two characters.
        /// </summary>
        /// <remarks>
        /// Only the text before the first space of a line is examined; it is expected to be
        /// <c>first&lt;&gt;second&lt;&gt;frequency</c>. Lines without a space are ignored, and lines
        /// whose leading part does not split into exactly three pieces print "check!" to the console.
        /// The frequency piece is not used.
        /// </remarks>
        /// <returns>The accepted bigrams, each formatted as <c>first&lt;&gt;second</c>.</returns>
        private List <string> FilterNGram()
        {
            List<string> ngrams     = new List<string>();
            string[]     separators = { "<>" };

            foreach (string ngram in FileOperators.ReadFileLines(n_gramFile))
            {
                int spaceIndex = ngram.IndexOf(' ');
                if (spaceIndex < 0)
                {
                    continue;
                }

                string[] terms = ngram.Substring(0, spaceIndex).Split(separators, StringSplitOptions.RemoveEmptyEntries);
                if (terms.Length != 3)
                {
                    Console.WriteLine("check!");
                    continue;
                }

                string first  = terms[0];
                string second = terms[1];

                // terms[2] holds the frequency. It used to be parsed with float.Parse and then
                // discarded, so the parse could only throw; it is left unparsed until it is needed.
                if (first.Length > 2 && second.Length > 2)
                {
                    ngrams.Add(first + "<>" + second);
                }
            }

            return ngrams;
        }
Example #6
0
        /// <summary>
        /// Parses a topic-term file whose lines look like <c>topicID:term1,value1;term2,value2;...</c>
        /// into a map from topic ID to (lower-cased, trimmed term -> value).
        /// </summary>
        /// <remarks>
        /// Lines without a ':' (including blank lines) and entries without a ',' (for example the
        /// empty piece after a trailing ';') are skipped instead of throwing. The first occurrence
        /// of a topic ID or of a term within a topic wins. Values are parsed with the invariant
        /// culture: ',' is the field separator of the format, so the decimal separator can only be '.'.
        /// </remarks>
        /// <param name="topicTermsPath">Path of the topic-term file.</param>
        /// <returns>Topic ID -> term -> value.</returns>
        /// <exception cref="System.FormatException">A value is not a valid number.</exception>
        public static Dictionary <string, Dictionary <string, float> > ParseTopicTerms(string topicTermsPath)
        {
            Dictionary<string, Dictionary<string, float>> topicTerms = new Dictionary<string, Dictionary<string, float>>();

            foreach (string line in FileOperators.ReadFileLines(topicTermsPath))
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    // Blank line or no topic separator: nothing to parse.
                    continue;
                }

                string topicID = line.Substring(0, colonIndex);
                if (topicTerms.ContainsKey(topicID))
                {
                    // Keep the first definition of a topic rather than crashing on a repeat.
                    continue;
                }

                Dictionary<string, float> termValueMap = new Dictionary<string, float>();
                foreach (string termValue in line.Substring(colonIndex + 1).Split(';'))
                {
                    string[] termAndValue = termValue.Split(',');
                    if (termAndValue.Length < 2)
                    {
                        continue;
                    }

                    string term = termAndValue[0].Trim().ToLower();
                    if (!termValueMap.ContainsKey(term))
                    {
                        termValueMap.Add(term, float.Parse(termAndValue[1], System.Globalization.CultureInfo.InvariantCulture));
                    }
                }

                topicTerms.Add(topicID, termValueMap);
            }

            return topicTerms;
        }
Example #7
0
        /// <summary>
        /// Parses a topic-term file whose lines look like <c>topic:term1,prop1;term2,prop2;...</c>
        /// into a map from topic name to (term -> value).
        /// </summary>
        /// <remarks>
        /// Lines without a ':' and entries without a ',' are skipped instead of throwing. A line
        /// holding a single <c>term,prop</c> pair (no ';') is parsed as well; previously such a line
        /// produced an empty term map. The first occurrence of a topic, or of a term within a topic,
        /// wins. Values are parsed with the invariant culture because ',' is the field separator of
        /// the format.
        /// </remarks>
        /// <param name="topicTermsFileName">Path of the topic-term file.</param>
        /// <returns>Topic name -> term -> value.</returns>
        /// <exception cref="System.FormatException">A value is not a valid number.</exception>
        private static Dictionary <string, Dictionary <string, double> > ParseTopicTerms(string topicTermsFileName)
        {
            Dictionary<string, Dictionary<string, double>> topicTermPropMap = new Dictionary<string, Dictionary<string, double>>();

            foreach (string line in FileOperators.ReadFileLines(topicTermsFileName))
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    continue;
                }

                string topicName = line.Substring(0, colonIndex);
                if (topicTermPropMap.ContainsKey(topicName))
                {
                    continue;
                }

                Dictionary<string, double> termProps = new Dictionary<string, double>();
                foreach (string termValuePair in line.Substring(colonIndex + 1).Split(';'))
                {
                    int commaIndex = termValuePair.IndexOf(',');
                    if (commaIndex < 0)
                    {
                        // Empty piece (e.g. after a trailing ';') or a term without a value.
                        continue;
                    }

                    string term = termValuePair.Substring(0, commaIndex);
                    if (termProps.ContainsKey(term))
                    {
                        continue;
                    }

                    string propStr = termValuePair.Substring(commaIndex + 1);
                    termProps.Add(term, double.Parse(propStr, System.Globalization.CultureInfo.InvariantCulture));
                }

                topicTermPropMap.Add(topicName, termProps);
            }

            return topicTermPropMap;
        }
Example #8
0
        //too slow, deprecated
        /// <summary>
        /// Opens a Word document through Office interop, concatenates the text of all of its
        /// paragraphs and writes the result to <paramref name="destFileName"/>.
        /// </summary>
        /// <remarks>
        /// Extraction stays best effort: an exception while reading paragraphs or writing the
        /// output is traced and not rethrown, as before. The document is closed and Word is quit
        /// on every path, including when opening the document fails.
        /// </remarks>
        /// <param name="sourceFileName">Path of the document to open.</param>
        /// <param name="destFileName">Path the extracted text is written to.</param>
        public static void ExecuteExtraction(string sourceFileName, string destFileName)
        {
            Application _application = new Application();
            Document    _document    = null;

            try
            {
                object docFilenameAsObject = sourceFileName;
                _document = _application.Documents.Open(ref docFilenameAsObject);

                try
                {
                    // StringBuilder avoids re-copying the whole text for every paragraph.
                    System.Text.StringBuilder docContent = new System.Text.StringBuilder();
                    foreach (Paragraph para in _document.Paragraphs)
                    {
                        docContent.Append(para.Range.Text);
                    }

                    FileOperators.FileWrite(destFileName, docContent.ToString());
                }
                catch (Exception ex)
                {
                    // Callers rely on extraction failures not propagating; leave a trace
                    // instead of dropping the error completely.
                    System.Diagnostics.Trace.TraceError("Word text extraction failed for " + sourceFileName + ": " + ex);
                }
            }
            finally
            {
                try
                {
                    if (_document != null)
                    {
                        ((_Document)_document).Close();
                    }
                }
                finally
                {
                    ((_Application)_application).Quit();
                }
            }
        }
Example #9
0
        /*
         * calculate the density per document
         */
        //find the density of all topics in each document
        //files in two levels
        /// <summary>
        /// Rebuilds the topic ranking: deletes an existing <c>rankResult</c> file, parses the
        /// lower-cased content of <c>topicTermsFilePath</c> into <c>topicTerms</c>, computes the
        /// density maps for the files directly under <c>txtCleanFileDir</c> and under each of its
        /// immediate subdirectories, and finally calls <c>WriteRankingResult</c>.
        /// </summary>
        /// <remarks>
        /// Returns right after the delete step when the topic-term file text is null or empty.
        /// </remarks>
        public void DoClumpingRank()
        {
            if (File.Exists(rankResult))
            {
                File.Delete(rankResult);
            }

            string rawTopicTerms = FileOperators.ReadFileText(topicTermsFilePath);
            if (string.IsNullOrEmpty(rawTopicTerms))
            {
                return;
            }

            topicTerms = ParseTopicTerms.GetTopicTermValueList(rawTopicTerms.ToLower());

            // Both maps start empty for every ranking run.
            docTopicDensityMap      = new Dictionary<string, Dictionary<string, float>>();
            fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();

            // First the files at the top level, then those one directory level down.
            CalDocDensityMap(Directory.GetFiles(txtCleanFileDir));
            foreach (string subDirectory in Directory.GetDirectories(txtCleanFileDir))
            {
                CalDocDensityMap(Directory.GetFiles(subDirectory));
            }

            WriteRankingResult();
        }
Example #10
0
        /// <summary>
        /// Parses a topic-term file whose lines look like <c>topicID:term1,value1;term2,value2;...</c>
        /// into a map from topic ID to the list of its terms (the values are ignored).
        /// </summary>
        /// <remarks>
        /// Lines without a ':' and entries without a ',' are skipped instead of throwing. A line
        /// holding a single <c>term,value</c> pair (no ';') is parsed as well; previously such a line
        /// produced an empty term list. The first occurrence of a topic ID wins.
        /// </remarks>
        /// <param name="topicTermFilePath">Path of the topic-term file.</param>
        /// <returns>Topic ID -> terms in file order.</returns>
        private static Dictionary <string, List <string> > ParseTopicTerms(string topicTermFilePath)
        {
            Dictionary<string, List<string>> topicTerms = new Dictionary<string, List<string>>();

            foreach (string line in FileOperators.ReadFileLines(topicTermFilePath))
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    continue;
                }

                string topicID = line.Substring(0, colonIndex);
                if (topicTerms.ContainsKey(topicID))
                {
                    // Keep the first definition of a topic rather than crashing on a repeat.
                    continue;
                }

                List<string> terms = new List<string>();
                foreach (string termValuePair in line.Substring(colonIndex + 1).Split(';'))
                {
                    int commaIndex = termValuePair.IndexOf(',');
                    if (commaIndex < 0)
                    {
                        // Empty piece (e.g. after a trailing ';') or a term without a value.
                        continue;
                    }

                    terms.Add(termValuePair.Substring(0, commaIndex));
                }

                topicTerms.Add(topicID, terms);
            }

            return topicTerms;
        }
Example #11
0
        /*
         * calculate the density per document
         */
        //find the density of all components in each document.
        //files in two levels
        /// <summary>
        /// Rebuilds the topic ranking while reporting progress through <c>OutputMg.OutputContent</c>:
        /// deletes an existing <c>rankResult</c> file, parses the lower-cased content of
        /// <c>topicTermsFilePath</c> into <c>topicTerms</c>, computes the density maps for the files
        /// directly under <c>txtCleanFileDir</c> and under each of its immediate subdirectories, and
        /// finally calls <c>WriteRankingResult</c>.
        /// </summary>
        /// <remarks>
        /// Like the parameterless overload, the method returns without ranking when the topic-term
        /// file text is null or empty, instead of calling <c>ToLower</c> on a possibly null string.
        /// </remarks>
        /// <param name="backgroundWorker">Worker passed on to <c>OutputMg.OutputContent</c> with each progress message.</param>
        public void DoClumpingRank(BackgroundWorker backgroundWorker)
        {
            if (File.Exists(rankResult))
            {
                File.Delete(rankResult);
            }

            OutputMg.OutputContent(backgroundWorker, "Start parsing topic terms");
            string topicTermRawData = FileOperators.ReadFileText(topicTermsFilePath);
            if (string.IsNullOrEmpty(topicTermRawData))
            {
                // Nothing to rank; same early exit as DoClumpingRank().
                return;
            }

            string topicTermContent = topicTermRawData.ToLower();

            topicTerms = ParseTopicTerms.GetTopicTermValueList(topicTermContent);
            OutputMg.OutputContent(backgroundWorker, "Finished parsing topic terms.");

            OutputMg.OutputContent(backgroundWorker, "Start ranking topic");
            docTopicDensityMap      = new Dictionary <string, Dictionary <string, float> >();
            fileNameTopicDensityMap = new Dictionary <string, Dictionary <string, float> >();

            // Files at the top level first, then those one directory level down.
            string[] fileEntities = Directory.GetFiles(txtCleanFileDir);
            CalDocDensityMap(fileEntities);
            string[] dirs = Directory.GetDirectories(txtCleanFileDir);
            foreach (string dir in dirs)
            {
                string[] subFileEntities = Directory.GetFiles(dir);
                CalDocDensityMap(subFileEntities);
            }

            OutputMg.OutputContent(backgroundWorker, "Finished ranking topic");
            OutputMg.OutputContent(backgroundWorker, "Start writing ranking topic");
            WriteRankingResult();
            OutputMg.OutputContent(backgroundWorker, "Finished writing ranking topic");
        }
Example #12
0
 /* Parse the terms and their weights from the TopicManualTerms.txt
  */
 /// <summary>
 /// Reads the file at <paramref name="searchTerms"/> and fills <c>compTerms</c> with one
 /// term -> weight dictionary per component line.
 /// </summary>
 /// <remarks>
 /// A usable line looks like <c>component:term1,weight1;term2,weight2;...</c>. Lines without
 /// a ':' and entries without a ',' are ignored.
 /// </remarks>
 /// <param name="searchTerms">Path of the file to read.</param>
 private void ParseCompTerms(string searchTerms)
 {
     foreach (string entry in FileOperators.ReadFileLines(searchTerms))
     {
         if (!entry.Contains(":"))
         {
             continue;
         }

         int    separatorAt   = entry.IndexOf(":");
         string componentName = entry.Substring(0, separatorAt);

         Dictionary<string, float> weights = new Dictionary<string, float>();
         foreach (string pair in entry.Substring(separatorAt + 1).Split(';'))
         {
             if (!pair.Contains(","))
             {
                 continue;
             }

             string[] fields = pair.Split(',');
             weights.Add(fields[0], float.Parse(fields[1]));
         }

         compTerms.Add(componentName, weights);
     }
 }
Example #13
0
        /// <summary>
        /// Returns the lower-cased, trimmed terms listed for <paramref name="targetTopicName"/> in
        /// a topic-term file whose lines look like <c>topic:term1,value1;term2,value2;...</c>.
        /// </summary>
        /// <remarks>
        /// A line is selected only when the text before its first ':' equals the target name
        /// (surrounding whitespace ignored). The previous prefix test made "Topic 1" also pick up
        /// the terms of "Topic 10", "Topic 11", and so on. Entries without a ',' are skipped
        /// instead of throwing.
        /// </remarks>
        /// <param name="topicTermPath">Path of the topic-term file.</param>
        /// <param name="targetTopicName">Name of the topic whose terms are wanted.</param>
        /// <returns>The terms of every matching line, in file order.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="targetTopicName"/> is null.</exception>
        public static List <string> ParseTopicTerms(string topicTermPath, string targetTopicName)
        {
            if (targetTopicName == null)
            {
                throw new System.ArgumentNullException("targetTopicName");
            }

            string       wantedName = targetTopicName.Trim();
            List<string> topicTerms = new List<string>();

            foreach (string line in FileOperators.ReadFileLines(topicTermPath))
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    continue;
                }

                if (!line.Substring(0, colonIndex).Trim().Equals(wantedName))
                {
                    continue;
                }

                // Split(';') also covers a line that holds just one "term,value" pair.
                foreach (string termValuePair in line.Substring(colonIndex + 1).Split(';'))
                {
                    int commaIndex = termValuePair.IndexOf(',');
                    if (commaIndex < 0)
                    {
                        continue;
                    }

                    topicTerms.Add(termValuePair.Substring(0, commaIndex).ToLower().Trim());
                }
            }

            return topicTerms;
        }
Example #14
0
 /* store the stemmed file as the original hierarchy
  */
 /// <summary>
 /// Recursively stems every file below <paramref name="sourceFile"/> and writes the stemmed
 /// text under <paramref name="stemmedPath"/>, mirroring the source directory hierarchy.
 /// </summary>
 /// <remarks>
 /// Does nothing when <c>IsDirectory(sourceFile)</c> is false. Each input line is split on
 /// spaces and commas, every token is passed through <c>Porter2.stem</c>, and the stems are
 /// written back separated (and followed) by a single space, one output line per input line.
 /// </remarks>
 /// <param name="sourceFile">Directory whose files and subdirectories are processed.</param>
 /// <param name="stemmedPath">Directory path under which the stemmed files are written.</param>
 public void FileDirStemming(string sourceFile, string stemmedPath)
 {
     if (!IsDirectory(sourceFile))
     {
         return;
     }

     foreach (string subDir in Directory.GetDirectories(sourceFile))
     {
         string subDirName = FileNameParse.GetFileName(subDir);
         FileDirStemming(subDir, stemmedPath + "\\" + subDirName);
     }

     string[] separators = { " ", "," };
     foreach (string subFile in Directory.GetFiles(sourceFile))
     {
         string[] fileLines = FileOperators.ReadFileLines(subFile);
         string   fileName  = FileNameParse.GetFileName(subFile);

         // StringBuilder instead of += : the old code re-copied the whole file text per token.
         System.Text.StringBuilder stemmedContent = new System.Text.StringBuilder();
         foreach (string fileLine in fileLines)
         {
             // NOTE(review): one stemmer per line is kept from the original; it can be hoisted
             // out of the loops if Porter2 holds no per-call state - confirm.
             Porter2 porter = new Porter2();
             foreach (string term in fileLine.Split(separators, StringSplitOptions.RemoveEmptyEntries))
             {
                 stemmedContent.Append(porter.stem(term)).Append(' ');
             }
             stemmedContent.Append("\r\n");
         }

         FileOperators.FileWrite(stemmedPath + "\\" + fileName, stemmedContent.ToString());
     }
 }
Example #15
0
        /*the number of times a term occurs in a document
         * for one term and one doc.
         */
        /// <summary>
        /// Counts how many times <paramref name="term"/> occurs in the text of the file at
        /// <paramref name="filePath"/>.
        /// </summary>
        /// <remarks>
        /// Occurrences are counted as non-overlapping, case-sensitive substring matches (the count
        /// is derived from how much shorter the text gets when the term is removed). A null or
        /// empty term, or null or empty file text, yields 0 instead of an exception.
        /// </remarks>
        /// <param name="term">Term to count.</param>
        /// <param name="filePath">Path of the file to read.</param>
        /// <returns>The number of occurrences.</returns>
        private int tf(string term, string filePath)
        {
            if (string.IsNullOrEmpty(term))
            {
                // string.Replace rejects an empty search string, and term.Length would be a zero divisor.
                return 0;
            }

            string fileContent = FileOperators.ReadFileText(filePath);
            if (string.IsNullOrEmpty(fileContent))
            {
                return 0;
            }

            return (fileContent.Length - fileContent.Replace(term, "").Length) / term.Length;
        }
Example #16
0
        /// <summary>
        /// Opens an Excel workbook through Office interop and, for each worksheet, appends the
        /// displayed text of every cell up to the last used row and column (each followed by a
        /// single space) to <paramref name="destFileName"/>.
        /// </summary>
        /// <remarks>
        /// A worksheet in which the search for the last used cell finds nothing is skipped;
        /// previously it caused a NullReferenceException. The workbook is closed and Excel is quit
        /// even when extraction fails, so a failure no longer leaves an Excel process behind.
        /// </remarks>
        /// <param name="sourceFileName">Path of the workbook to open.</param>
        /// <param name="destFileName">Path the extracted text is appended to, once per worksheet.</param>
        public static void ExecuteExtraction(string sourceFileName, string destFileName)
        {
            Application _application = new Application();
            Workbook    oriWorkBook  = null;

            try
            {
                oriWorkBook = _application.Workbooks.Open(sourceFileName);
                int sheetNum = oriWorkBook.Worksheets.Count;

                // Worksheets is a 1-based collection.
                for (int i = 1; i <= sheetNum; i++)
                {
                    Worksheet xlWorkSheet = oriWorkBook.Worksheets[i];

                    // Last used row: search backwards by rows, looking at values so that formulas
                    // which evaluate to blank are ignored.
                    Range lastRowCell = xlWorkSheet.Cells.Find(
                        "*",
                        System.Reflection.Missing.Value,
                        Microsoft.Office.Interop.Excel.XlFindLookIn.xlValues,
                        Microsoft.Office.Interop.Excel.XlLookAt.xlWhole,
                        Microsoft.Office.Interop.Excel.XlSearchOrder.xlByRows,
                        Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                        false,
                        System.Reflection.Missing.Value,
                        System.Reflection.Missing.Value);

                    // Last used column: search backwards by columns.
                    Range lastColCell = xlWorkSheet.Cells.Find(
                        "*",
                        System.Reflection.Missing.Value,
                        System.Reflection.Missing.Value,
                        System.Reflection.Missing.Value,
                        Microsoft.Office.Interop.Excel.XlSearchOrder.xlByColumns,
                        Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                        false,
                        System.Reflection.Missing.Value,
                        System.Reflection.Missing.Value);

                    if (lastRowCell == null || lastColCell == null)
                    {
                        // Find returns null when nothing matches, i.e. the sheet has no content.
                        continue;
                    }

                    int lastRowIgnoreFormulas = lastRowCell.Row;
                    int lastColIgnoreFormulas = lastColCell.Column;

                    // StringBuilder avoids re-copying the sheet text for every cell.
                    System.Text.StringBuilder fileContent = new System.Text.StringBuilder();
                    for (int j = 1; j <= lastRowIgnoreFormulas; j++)
                    {
                        for (int k = 1; k <= lastColIgnoreFormulas; k++)
                        {
                            Range  usedRange = xlWorkSheet.Cells[j, k];
                            string rangeText = usedRange.Text;
                            fileContent.Append(rangeText).Append(' ');
                        }
                    }

                    FileOperators.FileAppend(destFileName, fileContent.ToString());
                }
            }
            finally
            {
                try
                {
                    if (oriWorkBook != null)
                    {
                        // NOTE(review): the first argument of Workbook.Close is documented as
                        // SaveChanges; confirm that passing XlSaveAction here discards changes as intended.
                        oriWorkBook.Close(XlSaveAction.xlDoNotSaveChanges);
                    }
                }
                finally
                {
                    _application.Application.Quit();
                }
            }
        }
Example #17
0
        /*
         * parse component and the related files from the results of ranking
         */
        /// <summary>
        /// Reads the ranking result at <paramref name="compRelatedFiles"/> and fills
        /// <c>compFiles</c> with one entry per chunk: component name -> related file names.
        /// </summary>
        /// <remarks>
        /// Chunks are separated by an empty line ("\r\n\r\n"). Within a chunk every line that
        /// contains a tab contributes its second tab-separated field, reduced to the part after the
        /// last backslash. The component name is the first field of the chunk's first line; when
        /// that line has no tab a message is printed and the name stays empty.
        /// </remarks>
        /// <param name="compRelatedFiles">Path of the ranking result file.</param>
        private void ParseCompFiles(string compRelatedFiles)
        {
            string[] chunkSeparator = new string[] { "\r\n\r\n" };
            string[] lineSeparator  = new string[] { "\r\n" };

            string rankingText = FileOperators.ReadFileText(compRelatedFiles);
            foreach (string compChunk in rankingText.Split(chunkSeparator, StringSplitOptions.RemoveEmptyEntries))
            {
                string[]      chunkLines   = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);
                string        headerLine   = chunkLines[0];
                string        compName     = "";
                List <string> relatedFiles = new List <string>();

                if (!headerLine.Contains('\t'))
                {
                    Console.WriteLine("wrong in the compFile! " + headerLine);
                }
                else
                {
                    string[] headerFields = headerLine.Split('\t');
                    compName = headerFields[0];
                    relatedFiles.Add(StripCompFileDirectory(headerFields[1]));
                }

                // Remaining lines only contribute file names; lines without a tab are ignored.
                for (int lineNo = 1; lineNo < chunkLines.Length; lineNo++)
                {
                    string curLine = chunkLines[lineNo];
                    if (curLine.Contains('\t'))
                    {
                        relatedFiles.Add(StripCompFileDirectory(curLine.Split('\t')[1]));
                    }
                }

                compFiles.Add(compName, relatedFiles);
            }
        }

        /// <summary>
        /// Returns the part of <paramref name="fileName"/> after its last backslash, or the value
        /// unchanged when it contains none.
        /// </summary>
        private static string StripCompFileDirectory(string fileName)
        {
            if (fileName.Contains(@"\"))
            {
                return fileName.Substring(fileName.LastIndexOf(@"\") + 1);
            }

            return fileName;
        }
Example #18
0
        /// <summary>
        /// Appends one lower-cased line per component to <c>fixedFormatPath</c>, in the form
        /// <c>component:name,1;...;term,0.2;...</c>.
        /// </summary>
        /// <remarks>
        /// Each comma-separated part of the component key is written with weight 1 and each term
        /// with weight 0.2. Components without terms are omitted. Lines are joined with "\r\n" and
        /// no newline follows the last one. The weights are written as fixed text: formatting the
        /// numbers with the current culture could emit "0,2", which collides with the ',' field
        /// separator of this format.
        /// </remarks>
        /// <param name="componentTerms">Component key -> terms belonging to that component.</param>
        private void WriteDomainTerms(Dictionary <string, List <string> > componentTerms)
        {
            List<string> lines = new List<string>();

            foreach (KeyValuePair<string, List<string>> entry in componentTerms)
            {
                string       component = entry.Key;
                List<string> terms     = entry.Value;
                if (terms.Count == 0)
                {
                    continue;
                }

                List<string> pairs = new List<string>();

                // Split(',') yields the key itself when it contains no comma.
                foreach (string name in component.Split(','))
                {
                    pairs.Add(name + ",1");
                }

                foreach (string term in terms)
                {
                    pairs.Add(term + ",0.2");
                }

                lines.Add((component + ":" + string.Join(";", pairs.ToArray())).ToLower());
            }

            FileOperators.FileAppend(fixedFormatPath, string.Join("\r\n", lines.ToArray()));
        }
Example #19
0
        /// <summary>
        /// Extracts the text of every page of a PDF and appends it, page by page, to
        /// <paramref name="destFileName"/>.
        /// </summary>
        /// <remarks>
        /// The reader is closed on every path, so the source file is released even when extraction
        /// or writing throws.
        /// </remarks>
        /// <param name="sourceFileName">Path of the PDF to read.</param>
        /// <param name="destFileName">Path the page texts are appended to.</param>
        /// <param name="backgroundWorker">Not used by this method.</param>
        public static void ExecuteExtraction(string sourceFileName, string destFileName, BackgroundWorker backgroundWorker = null)
        {
            PdfReader reader = new PdfReader(sourceFileName);

            try
            {
                int pdfNum = reader.NumberOfPages;

                // Page numbers are 1-based.
                for (int i = 1; i <= pdfNum; i++)
                {
                    // A fresh strategy instance is handed to the extractor for each page.
                    var    textPos  = new MyLocationExtractionStrategy();
                    string pageText = PdfTextExtractor.GetTextFromPage(reader, i, textPos);
                    FileOperators.FileAppend(destFileName, pageText);
                }
            }
            finally
            {
                reader.Close();
            }
        }
Example #20
0
        /// <summary>
        /// Builds the topic ID -> topic name map from the file at <c>topicNamePath</c>, whose lines
        /// look like <c>id:name</c>.
        /// </summary>
        /// <remarks>
        /// Returns an empty map when <c>topicNamePath</c> is null or empty. Lines without a ':'
        /// (for example blank lines) are skipped instead of throwing, and the first occurrence of
        /// an ID wins.
        /// </remarks>
        /// <returns>Topic ID -> topic name.</returns>
        private Dictionary <string, string> mapTopicIDName()
        {
            Dictionary<string, string> IDNameMap = new Dictionary<string, string>();

            if (string.IsNullOrEmpty(this.topicNamePath))
            {
                return IDNameMap;
            }

            foreach (string line in FileOperators.ReadFileLines(this.topicNamePath))
            {
                string[] terms = line.Split(':');
                if (terms.Length < 2)
                {
                    continue;
                }

                string topicID = terms[0];
                if (!IDNameMap.ContainsKey(topicID))
                {
                    IDNameMap.Add(topicID, terms[1]);
                }
            }

            return IDNameMap;
        }
Example #21
0
        /// <summary>
        /// Cleans one text file: reads its lines, runs them through <c>DeleteDecorate</c>,
        /// <c>DetailClean</c>, <c>CleanMoreSpace</c> and <c>MergeContent</c> in that order, and
        /// writes the result, joined with "\r\n", to <paramref name="destFileName"/>.
        /// </summary>
        /// <param name="sourceFilePath">Path of the file to clean.</param>
        /// <param name="destFileName">Path the cleaned text is written to.</param>
        private void CleanSingleFile(string sourceFilePath, string destFileName)
        {
            string[] oriLines = FileOperators.ReadFileLines(sourceFilePath);

            // Per the original notes: removes contents, list of figures, list of tables, appendix.
            List<string> filterDecorate = DeleteDecorate(oriLines);
            List<string> cleanedText    = DetailClean(filterDecorate);

            // Per the original notes: collapses runs of consecutive spaces within each line.
            List<string> furtherClean  = CleanMoreSpace(cleanedText);
            List<string> mergedContent = MergeContent(furtherClean);

            FileOperators.FileWrite(destFileName, String.Join("\r\n", mergedContent));
        }
Example #22
0
        /// <summary>
        /// Reads the topic-term file at <c>topicTermsPath</c> (lines like
        /// <c>topicID:term1,value1;term2,value2;...</c>), replaces each topic ID by its name from
        /// <c>mapTopicIDName()</c> when one exists, and stores the result of
        /// <c>NormalizeTopicRelevance.DoNormalize</c> in <c>normalizedTopicTerms</c>.
        /// </summary>
        /// <remarks>
        /// Terms are trimmed and lower-cased; values are kept as text. Lines without a ':'
        /// (including blank lines) and entries without a ',' (for example the empty piece after a
        /// trailing ';') are skipped instead of throwing. The first occurrence of a topic name, or
        /// of a term within a topic, wins.
        /// </remarks>
        private void getAllTopicTerms()
        {
            Dictionary<string, Dictionary<string, string>> topicTerms = new Dictionary<string, Dictionary<string, string>>();

            string[] topicLines = FileOperators.ReadFileLines(topicTermsPath);
            Dictionary<string, string> topicIDNameMap = mapTopicIDName();

            foreach (string line in topicLines)
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    continue;
                }

                string topicID = line.Substring(0, colonIndex);

                // Fall back to the raw ID when no display name is known for it.
                string topicName;
                if (!topicIDNameMap.TryGetValue(topicID, out topicName))
                {
                    topicName = topicID;
                }

                if (topicTerms.ContainsKey(topicName))
                {
                    // Two lines resolved to the same name: keep the first instead of crashing.
                    continue;
                }

                Dictionary<string, string> termValueMap = new Dictionary<string, string>();
                foreach (string termValue in line.Substring(colonIndex + 1).Split(';'))
                {
                    string[] termAndValue = termValue.Split(',');
                    if (termAndValue.Length < 2)
                    {
                        continue;
                    }

                    string term = termAndValue[0].Trim().ToLower();
                    if (!termValueMap.ContainsKey(term))
                    {
                        termValueMap.Add(term, termAndValue[1]);
                    }
                }

                topicTerms.Add(topicName, termValueMap);
            }

            NormalizeTopicRelevance normalizer = new NormalizeTopicRelevance();

            normalizedTopicTerms = normalizer.DoNormalize(topicTerms);
        }
Example #23
0
        /* Parse the summary.txt file of the Stanford TMT results.
         */
        /// <summary>
        /// Converts a topic summary file into one <c>topicId:term,relevance;term,relevance...</c>
        /// record per topic and appends each record to <paramref name="destFilePath"/>.
        /// </summary>
        /// <remarks>
        /// An existing destination file is deleted first. Topic parts are separated by
        /// "\r\n\n\r\n". Within a part, a line starting with "Topic" supplies the topic ID and,
        /// after a double tab, the topic weight; every other line supplies a tab-separated term and
        /// relevance, which is divided by the topic weight. Numbers are parsed and written with the
        /// invariant culture, because a culture-specific decimal comma would collide with the ','
        /// field separator of the output. Term lines with fewer than two fields are skipped, and
        /// only a trailing ';' is stripped from a record.
        /// </remarks>
        /// <param name="sourceFilePath">Path of the summary file to read.</param>
        /// <param name="destFilePath">Path the converted records are appended to.</param>
        /// <exception cref="System.FormatException">A weight or relevance is not a valid number.</exception>
        public static void Execute(string sourceFilePath, string destFilePath)
        {
            if (File.Exists(destFilePath))
            {
                File.Delete(destFilePath);
            }

            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            string   topicContent        = FileOperators.ReadFileText(sourceFilePath);
            string[] stringSeparators    = new string[] { "\r\n\n\r\n" };
            string[] topicPartSeparators = new string[] { "\r\n" };

            foreach (string topicPart in topicContent.Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries))
            {
                System.Text.StringBuilder singleTopicContent = new System.Text.StringBuilder();
                float topicSumWeight = 0f;

                foreach (string line in topicPart.Split(topicPartSeparators, StringSplitOptions.RemoveEmptyEntries))
                {
                    if (line.StartsWith("Topic"))
                    {
                        int    spaceIndex  = line.IndexOf("\t\t");
                        string topicId     = line.Substring(0, spaceIndex);
                        string topicWeight = line.Substring(spaceIndex + 2);
                        topicSumWeight = float.Parse(topicWeight, invariant);
                        singleTopicContent.Append(topicId).Append(':');
                    }
                    else
                    {
                        string[] terms = line.Trim().Split('\t');
                        if (terms.Length < 2)
                        {
                            // No relevance column on this line; nothing to record.
                            continue;
                        }

                        float relevanceValue = float.Parse(terms[1], invariant) / topicSumWeight;
                        singleTopicContent.Append(terms[0]).Append(',').Append(relevanceValue.ToString(invariant)).Append(';');
                    }
                }

                if (singleTopicContent.Length > 0)
                {
                    // Strip only a trailing ';'. The old unconditional removal also ate the ':'
                    // of a topic that had no term lines.
                    if (singleTopicContent[singleTopicContent.Length - 1] == ';')
                    {
                        singleTopicContent.Length -= 1;
                    }

                    FileOperators.FileAppend(destFilePath, singleTopicContent.ToString());
                }
            }
        }
Example #24
0
        /* input: the component that the summary is related to.
         * compName: the target component; either a single component, or component\\subcomponent when the target is a subcomponent
         */
        /// <summary>
        /// Gathers candidate sentences and a list of names for the target at
        /// <paramref name="compTextPath"/> + "\" + <paramref name="compName"/>, asks
        /// <c>MMRSummary</c> for a summary, and writes a non-empty result to
        /// <paramref name="summaryStore"/>. An empty result is only reported on the console.
        /// </summary>
        /// <param name="compTextPath">Base path that is joined with <paramref name="compName"/> using a backslash.</param>
        /// <param name="compName">Name of the target relative to <paramref name="compTextPath"/>.</param>
        /// <param name="subcompTerms">Extra names appended after the target's own name; only read when the target path is not a directory.</param>
        /// <param name="summaryStore">Destination passed to <c>FileOperators.FileWrite</c>.</param>
        public void GenerateSummary(string compTextPath, string compName, List<string> subcompTerms, string summaryStore)
        {
            string targetPath = compTextPath + "\\" + compName;

            // Sentences are collected up front, before the kind of target is inspected.
            List<string> sentences  = SplitSentences(targetPath);
            List<string> queryNames = new List<string>();

            if (!IsDirectory(targetPath))
            {
                // Target is a single file: its own name comes first, followed by the supplied terms.
                queryNames.Add(FileNameParse.GetFileName(targetPath));
                foreach (string extraTerm in subcompTerms)
                {
                    queryNames.Add(extraTerm);
                }
            }
            else
            {
                // Target is a directory: every file directly inside it contributes one name.
                foreach (string memberPath in Directory.GetFiles(targetPath))
                {
                    queryNames.Add(FileNameParse.GetFileName(memberPath));
                }
            }

            MMRSummary summarizer = new MMRSummary();
            string     summary    = summarizer.GenerateSummary(queryNames, sentences);

            if (string.IsNullOrEmpty(summary))
            {
                Console.WriteLine("summary is empty:" + summary + ":" + compName);
                return;
            }

            FileOperators.FileWrite(summaryStore, summary);
        }
Example #25
0
        /*generate and store the tf-idf of each file
         */
        /// <summary>
        /// Writes the title line to <c>storePath</c>, refreshes <c>terms_idf</c> from <c>idf()</c>,
        /// then hands the files directly inside <c>fileDir</c> and the files of each of its
        /// immediate subdirectories to <c>updateTfidf</c>.
        /// </summary>
        public void calTfidf()
        {
            // Title line: "fileName", a comma before the first term, semicolons between the rest.
            // With no terms at all the line is just "fileName".
            string header    = "fileName";
            string delimiter = ",";
            foreach (string term in tacticTerms)
            {
                header   += delimiter + term;
                delimiter = ";";
            }
            FileOperators.FileWrite(this.storePath, header);

            terms_idf = idf();

            // Top-level files first, then one level of subdirectories.
            updateTfidf(Directory.GetFiles(fileDir));
            foreach (string subDirectory in Directory.GetDirectories(fileDir))
            {
                updateTfidf(Directory.GetFiles(subDirectory));
            }
        }
Example #26
0
        /// <summary>
        /// Appends one semicolon-separated row per entry to <c>storePath</c>: the entry itself
        /// followed by tf * idf for every term in <c>tacticTerms</c>.
        /// </summary>
        /// <param name="fileEntries">File paths to score; each one is passed to <c>tf</c> and written as the first cell of its row.</param>
        private void updateTfidf(string[] fileEntries)
        {
            foreach (string filePath in fileEntries)
            {
                List<string> cells = new List<string>();
                cells.Add(filePath);

                foreach (string term in tacticTerms)
                {
                    int    termFrequency  = tf(term, filePath);
                    double inverseDocFreq = terms_idf[term];
                    double weight         = termFrequency * inverseDocFreq;
                    cells.Add(weight.ToString());
                }

                FileOperators.FileAppend(this.storePath, string.Join(";", cells.ToArray()));
            }
        }
Example #27
0
        /// <summary>
        /// Re-keys <c>docTopicDensityMap</c> from document to topic, sorts each topic's documents
        /// with <c>DictionaryDecreasedSort.DecreasedByValue</c>, and appends the entries whose
        /// density is greater than zero to <c>rankResult</c> as "topic\tfile\tdensity" lines.
        /// </summary>
        private void WriteRankingResult()
        {
            // topic -> (document -> density)
            Dictionary<string, Dictionary<string, float>> densitiesByTopic = new Dictionary<string, Dictionary<string, float>>();

            foreach (string doc in docTopicDensityMap.Keys)
            {
                Dictionary<string, float> densitiesOfDoc = docTopicDensityMap[doc];
                foreach (KeyValuePair<string, float> topicEntry in densitiesOfDoc)
                {
                    Dictionary<string, float> docsOfTopic;
                    if (!densitiesByTopic.TryGetValue(topicEntry.Key, out docsOfTopic))
                    {
                        docsOfTopic = new Dictionary<string, float>();
                        densitiesByTopic.Add(topicEntry.Key, docsOfTopic);
                    }
                    docsOfTopic.Add(doc, topicEntry.Value);
                }
            }

            foreach (KeyValuePair<string, Dictionary<string, float>> topicGroup in densitiesByTopic)
            {
                Dictionary<string, float> ranked = DictionaryDecreasedSort.DecreasedByValue(topicGroup.Value);

                string report = "";
                foreach (KeyValuePair<string, float> rankedDoc in ranked)
                {
                    // The leading filePathLength characters are cut off the document key.
                    string relativeName = rankedDoc.Key.Substring(filePathLength);
                    if (rankedDoc.Value > 0)
                    {
                        report += topicGroup.Key + "\t" + relativeName + "\t" + rankedDoc.Value + "\r\n";
                    }
                }

                // One append per topic, issued even when no document qualified.
                FileOperators.FileAppend(rankResult, report);
            }
        }
Example #28
0
        /// <summary>
        /// Registers a topic-density dictionary for every path in <paramref name="fileEntities"/>.
        /// Densities are computed once per bare file name (the text after the last backslash)
        /// and reused for any later path that ends in the same name.
        /// </summary>
        /// <param name="fileEntities">Paths of the files to process; each becomes a key of <c>docTopicDensityMap</c>.</param>
        private void CalDocDensityMap(string[] fileEntities)
        {
            foreach (string fileEntity in fileEntities)
            {
                string fileName = fileEntity.Substring(fileEntity.LastIndexOf('\\') + 1);

                Dictionary<string, int> topicDensity;
                if (!fileNameTopicDensityMap.TryGetValue(fileName, out topicDensity))
                {
                    // First time this file name is seen: tokenize the lower-cased text on single spaces.
                    // NOTE(review): only '\n' is replaced, so a '\r' from CRLF input stays attached
                    // to the preceding token - confirm this is intended.
                    string       normalizedText = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
                    List<string> tokens         = new List<string>(normalizedText.Split(' '));

                    topicDensity = CalWindowDensityInOneFile(tokens);
                    fileNameTopicDensityMap.Add(fileName, topicDensity);
                }

                docTopicDensityMap.Add(fileEntity, topicDensity);
            }
        }
Example #29
0
        /// <summary>
        /// For every topic in <c>normalizedTopicTerms</c>: generates a tf-idf store for the topic's
        /// terms via <c>TFIDF.calTfidf</c>, reads it back, computes the similarity of each document
        /// row to the topic's value vector with <c>VSM.calSimilarity</c>, and appends the documents
        /// with similarity greater than zero, in the order returned by
        /// <c>DictionaryDecreasedSort.DecreasedByValue</c>, to <c>simStorePath</c>.
        /// </summary>
        public void executeRank()
        {
            getAllTopicTerms();

            // Number of leading characters cut off each document key when writing results.
            int docsPathLength = docsPath.Length;

            foreach (KeyValuePair<string, Dictionary<string, float>> topic in normalizedTopicTerms)
            {
                string       topicId       = topic.Key;
                List<string> topicTermList = new List<string>(topic.Value.Keys);
                List<float>  topicVector   = new List<float>(topic.Value.Values);

                tfidfStore = docsPath + "-ifidf\\" + topicId + ".csv";
                new TFIDF(topicTermList, this.docsPath, tfidfStore).calTfidf();

                string[] rows = FileOperators.ReadFileLines(tfidfStore);
                VSM      vsm  = new VSM();

                Dictionary<string, double> relevanceByDoc = new Dictionary<string, double>();

                // Index 0 is skipped - presumably the title line written by TFIDF.calTfidf.
                for (int rowIndex = 1; rowIndex < rows.Length; rowIndex++)
                {
                    string row            = rows[rowIndex];
                    int    firstSemicolon = row.IndexOf(';');
                    string docPath        = row.Substring(0, firstSemicolon);
                    string[] cells        = row.Substring(firstSemicolon + 1).Split(';');

                    List<float> docVector = new List<float>();
                    foreach (string cell in cells)
                    {
                        docVector.Add(float.Parse(cell));
                    }

                    double similarity = vsm.calSimilarity(docVector, topicVector);
                    if (similarity > 0)
                    {
                        relevanceByDoc.Add(docPath, similarity);
                    }
                }

                Dictionary<string, double> ranked = DictionaryDecreasedSort.DecreasedByValue(relevanceByDoc);

                string report = "";
                foreach (KeyValuePair<string, double> hit in ranked)
                {
                    report += topicId + "\t" + hit.Key.Substring(docsPathLength) + "\t" + hit.Value + "\r\n";
                }
                FileOperators.FileAppend(simStorePath, report);
            }

            Console.WriteLine("DONE!!");
        }
Example #30
0
        /*
         * oriPDFPath: the pdf documents are stored in a directory
         */
        /// <summary>
        /// Maps every subdirectory of <paramref name="oriPDFPath"/> to the names of the files it
        /// contains, then walks the blank-line-separated chunks of <paramref name="resultFile"/>.
        /// For each chunk whose first line contains a tab, the text before the first tab is taken
        /// as a component name (or a comma-separated list of candidates) and, when it matches a
        /// directory name, the chunk's lines are passed to <c>ReadResultFile</c> with that
        /// directory's file list.
        /// </summary>
        /// <param name="resultFile">Path of the result file; chunks are separated by "\r\n\r\n" and lines by "\r\n".</param>
        /// <param name="oriPDFPath">Directory whose immediate subdirectories are treated as components.</param>
        public void CountDocRelativeSeq(string resultFile, string oriPDFPath)
        {
            // component name -> names of the files stored in that component's directory
            Dictionary<string, List<string>> compFiles = new Dictionary<string, List<string>>();

            foreach (string compDir in Directory.GetDirectories(oriPDFPath))
            {
                List<string> fileList = new List<string>();
                foreach (string filePath in Directory.GetFiles(compDir))
                {
                    fileList.Add(FileNameParse.GetFileName(filePath));
                }
                compFiles.Add(FileNameParse.GetFileName(compDir), fileList);
            }

            string resultFileContent = FileOperators.ReadFileText(resultFile);

            string[] chunkSeparator = new string[] { "\r\n\r\n" };
            string[] lineSeparator  = new string[] { "\r\n" };

            foreach (string compChunk in resultFileContent.Split(chunkSeparator, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] lines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);

                // A chunk made only of line breaks (e.g. an odd number of trailing CRLFs) survives
                // the outer split but yields no lines; skip it instead of indexing lines[0].
                if (lines.Length == 0)
                {
                    continue;
                }

                string firstLine = lines[0];
                if (!firstLine.Contains("\t"))
                {
                    continue;
                }

                string compName = firstLine.Split('\t')[0];

                if (compName.Contains(","))
                {
                    // Several candidate names: the first candidate that equals a lower-cased
                    // directory name wins, and the remaining candidates are not tried.
                    // NOTE(review): this branch trims and compares against the lower-cased
                    // directory name, while the single-name branch below requires an exact
                    // match - confirm the difference is intended.
                    bool matched = false;
                    foreach (string rawCandidate in compName.Split(','))
                    {
                        string candidate = rawCandidate.Trim();
                        foreach (string fileComp in compFiles.Keys)
                        {
                            if (candidate.Equals(fileComp.ToLower()))
                            {
                                ReadResultFile(fileComp, lines, compFiles[fileComp]);
                                matched = true;
                                break;
                            }
                        }

                        if (matched)
                        {
                            break;
                        }
                    }
                }
                else
                {
                    // Single name: exact match against the directory names.
                    List<string> targetFiles;
                    if (compFiles.TryGetValue(compName, out targetFiles))
                    {
                        ReadResultFile(compName, lines, targetFiles);
                    }
                }
            }
        }