Пример #1
0
        /// <summary>
        /// Computes the topic density of every given file and records it in
        /// <c>docTopicDensityMap</c> (keyed by the full path) and
        /// <c>fileNameTopicDensityMap</c> (keyed by the bare file name).
        /// A density already computed for a file with the same name is reused
        /// instead of reading the file again.
        /// </summary>
        /// <param name="fileEntities">Paths of the files to process.</param>
        private void CalDocDensityMap(string[] fileEntities)
        {
            char[] delimiters = new char[] { ' ' };

            foreach (string fileEntity in fileEntities)
            {
                string fileName = FileNameParse.GetFileName(fileEntity);

                Dictionary<string, float> topicDensity;
                if (!fileNameTopicDensityMap.TryGetValue(fileName, out topicDensity))
                {
                    string   fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
                    string[] rawTerms    = fileContent.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

                    // Collect the terms of the file; single-character tokens are ignored.
                    List<string> fileTermList = new List<string>(rawTerms.Length);
                    foreach (string rawTerm in rawTerms)
                    {
                        if (rawTerm.Length > 1)
                        {
                            fileTermList.Add(rawTerm);
                        }
                    }

                    if (fileTermList.Count == 0)
                    {
                        // Skip only this file. The previous 'break' left the loop and
                        // silently dropped every file that came after an empty one.
                        continue;
                    }

                    // Topic -> density for this single file.
                    topicDensity = CalWindowDensityInOneFile(fileTermList);
                    fileNameTopicDensityMap.Add(fileName, topicDensity);
                }

                docTopicDensityMap.Add(fileEntity, topicDensity);
            }
        }
Пример #2
0
        /// <summary>
        /// Calculates the density of all topics in each document and writes the
        /// ranking result. Documents are taken from two levels: the files directly
        /// inside <c>txtCleanFileDir</c> and the files inside each of its immediate
        /// sub-directories.
        /// </summary>
        public void DoClumpingRank()
        {
            // Start from a clean result file.
            if (File.Exists(rankResult))
            {
                File.Delete(rankResult);
            }

            string rawTopicTerms = FileOperators.ReadFileText(topicTermsFilePath);
            if (string.IsNullOrEmpty(rawTopicTerms))
            {
                // Nothing to rank against.
                return;
            }

            topicTerms = ParseTopicTerms.GetTopicTermValueList(rawTopicTerms.ToLower());

            docTopicDensityMap      = new Dictionary<string, Dictionary<string, float>>();
            fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();

            // First level: files directly in the root directory.
            CalDocDensityMap(Directory.GetFiles(txtCleanFileDir));

            // Second level: files of each immediate sub-directory.
            foreach (string subDirectory in Directory.GetDirectories(txtCleanFileDir))
            {
                CalDocDensityMap(Directory.GetFiles(subDirectory));
            }

            WriteRankingResult();
        }
Пример #3
0
        /// <summary>
        /// Calculates the density of all components (topics) in each document and
        /// writes the ranking result, reporting progress through
        /// <c>OutputMg.OutputContent</c>. Documents are taken from two levels: the
        /// files directly inside <c>txtCleanFileDir</c> and the files inside each of
        /// its immediate sub-directories.
        /// </summary>
        /// <param name="backgroundWorker">Worker passed to every progress message.</param>
        public void DoClumpingRank(BackgroundWorker backgroundWorker)
        {
            // Start from a clean result file.
            if (File.Exists(rankResult))
            {
                File.Delete(rankResult);
            }

            OutputMg.OutputContent(backgroundWorker, "Start parsing topic terms");
            // NOTE(review): the text returned by ReadFileText is dereferenced without
            // a null check - confirm the helper never returns null.
            string loweredTopicTerms = FileOperators.ReadFileText(topicTermsFilePath).ToLower();
            topicTerms = ParseTopicTerms.GetTopicTermValueList(loweredTopicTerms);
            OutputMg.OutputContent(backgroundWorker, "Finished parsing topic terms.");

            OutputMg.OutputContent(backgroundWorker, "Start ranking topic");
            docTopicDensityMap      = new Dictionary<string, Dictionary<string, float>>();
            fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();

            // First level: files directly in the root directory.
            CalDocDensityMap(Directory.GetFiles(txtCleanFileDir));

            // Second level: files of each immediate sub-directory.
            foreach (string subDirectory in Directory.GetDirectories(txtCleanFileDir))
            {
                CalDocDensityMap(Directory.GetFiles(subDirectory));
            }
            OutputMg.OutputContent(backgroundWorker, "Finished ranking topic");

            OutputMg.OutputContent(backgroundWorker, "Start writing ranking topic");
            WriteRankingResult();
            OutputMg.OutputContent(backgroundWorker, "Finished writing ranking topic");
        }
Пример #4
0
        /// <summary>
        /// Term frequency: the number of non-overlapping, case-sensitive (ordinal)
        /// occurrences of one term in the text of one document.
        /// </summary>
        /// <param name="term">Term to count.</param>
        /// <param name="filePath">Path of the document whose text is searched.</param>
        /// <returns>
        /// The occurrence count; 0 when the term or the document text is null or empty.
        /// </returns>
        private int tf(string term, string filePath)
        {
            // An empty term used to make String.Replace throw (and would divide by zero).
            if (string.IsNullOrEmpty(term))
            {
                return 0;
            }

            string fileContent = FileOperators.ReadFileText(filePath);
            if (string.IsNullOrEmpty(fileContent))
            {
                return 0;
            }

            // Count matches left to right, resuming after each match, which yields the
            // same number as the former "length difference after Replace" computation
            // without allocating a copy of the whole document.
            int freq     = 0;
            int position = fileContent.IndexOf(term, System.StringComparison.Ordinal);
            while (position >= 0)
            {
                freq++;
                position = fileContent.IndexOf(term, position + term.Length, System.StringComparison.Ordinal);
            }
            return freq;
        }
Пример #5
0
        /// <summary>
        /// Parses the components and their related files from a ranking result file
        /// and stores them in <c>compFiles</c> (component name -> related file names).
        /// </summary>
        /// <remarks>
        /// The file is read as chunks separated by an empty line ("\r\n\r\n"). Each
        /// line of a chunk that contains a tab contributes the file name found in its
        /// second tab-separated column, reduced to the part after the last backslash.
        /// The component name is the first column of the chunk's first line.
        /// </remarks>
        /// <param name="compRelatedFiles">Path of the ranking result file.</param>
        private void ParseCompFiles(string compRelatedFiles)
        {
            string fileContent = FileOperators.ReadFileText(compRelatedFiles);
            if (string.IsNullOrEmpty(fileContent))
            {
                return;
            }

            string[] chunkSeparators = new string[] { "\r\n\r\n" };
            string[] lineSeparators  = new string[] { "\r\n" };

            foreach (string compChunk in fileContent.Split(chunkSeparators, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] chunkLines = compChunk.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries);
                if (chunkLines.Length == 0)
                {
                    // A chunk made only of line breaks has no first line to read.
                    continue;
                }

                string firstLine = chunkLines[0];
                string compName  = "";
                if (firstLine.IndexOf('\t') >= 0)
                {
                    compName = firstLine.Split('\t')[0];
                }
                else
                {
                    Console.WriteLine("wrong in the compFile! " + firstLine);
                }

                // The first line and the following lines carry the file name the same
                // way, so one loop handles them all; lines without a tab are skipped.
                List<string> relatedFiles = new List<string>();
                foreach (string chunkLine in chunkLines)
                {
                    if (chunkLine.IndexOf('\t') < 0)
                    {
                        continue;
                    }

                    string fileName = chunkLine.Split('\t')[1];
                    // LastIndexOf returns -1 when there is no backslash, so the whole
                    // value is kept in that case.
                    relatedFiles.Add(fileName.Substring(fileName.LastIndexOf('\\') + 1));
                }

                if (compFiles.ContainsKey(compName))
                {
                    // A repeated component name (or a second malformed chunk, which
                    // uses the empty name) used to throw from Dictionary.Add.
                    compFiles[compName].AddRange(relatedFiles);
                }
                else
                {
                    compFiles.Add(compName, relatedFiles);
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Parses the summary.txt file of the Stanford TMT results and appends one
        /// line per topic to the destination file, in the form
        /// <c>topicId:term,weight;term,weight</c>, where each weight is the term's
        /// relevance divided by the topic's total weight.
        /// </summary>
        /// <remarks>
        /// Numbers are parsed and written with the invariant culture: the output
        /// uses ',' as a field separator, so a comma-decimal current culture would
        /// otherwise produce ambiguous output (and fail to parse '.'-decimal input).
        /// </remarks>
        /// <param name="sourceFilePath">Path of the summary file to read.</param>
        /// <param name="destFilePath">Path of the result file; deleted first if it exists.</param>
        public static void Execute(string sourceFilePath, string destFilePath)
        {
            if (File.Exists(destFilePath))
            {
                File.Delete(destFilePath);
            }

            string topicContent = FileOperators.ReadFileText(sourceFilePath);
            if (string.IsNullOrEmpty(topicContent))
            {
                return;
            }

            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            string[] stringSeparators    = new string[] { "\r\n\n\r\n" };
            string[] topicPartSeparators = new string[] { "\r\n" };

            string[] topicParts = topicContent.Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries);
            foreach (string topicPart in topicParts)
            {
                string[] lines = topicPart.Split(topicPartSeparators, StringSplitOptions.RemoveEmptyEntries);
                string   singleTopicContent = "";
                float    topicSumWeight     = 0f;
                foreach (string line in lines)
                {
                    if (line.StartsWith("Topic", StringComparison.Ordinal))
                    {
                        // Header line: "<topic id>\t\t<total weight>".
                        int spaceIndex = line.IndexOf("\t\t", StringComparison.Ordinal);
                        if (spaceIndex < 0)
                        {
                            // Malformed header; Substring(0, -1) used to throw here.
                            continue;
                        }

                        string topicId     = line.Substring(0, spaceIndex);
                        string topicWeight = line.Substring(spaceIndex + 2);
                        topicSumWeight      = float.Parse(topicWeight, invariant);
                        singleTopicContent += topicId + ":";
                    }
                    else
                    {
                        // Term line: "<term>\t<relevance>".
                        string[] terms = line.Trim().Split('\t');
                        if (terms.Length < 2)
                        {
                            // Blank or malformed line; terms[1] used to throw here.
                            continue;
                        }

                        float relevanceValue = float.Parse(terms[1], invariant) / topicSumWeight;
                        singleTopicContent += terms[0] + "," + relevanceValue.ToString(invariant) + ";";
                    }
                }

                if (singleTopicContent.Length > 0)
                {
                    // Drop the trailing separator character before writing.
                    singleTopicContent = singleTopicContent.Remove(singleTopicContent.Length - 1);
                    FileOperators.FileAppend(destFilePath, singleTopicContent);
                }
            }
        }
Пример #7
0
        /// <summary>
        /// Computes the topic density of every given file and records it in
        /// <c>docTopicDensityMap</c> (keyed by the full path) and
        /// <c>fileNameTopicDensityMap</c> (keyed by the bare file name). A density
        /// already stored for the same file name is reused.
        /// </summary>
        /// <param name="fileEntities">Paths of the files to process.</param>
        private void CalDocDensityMap(string[] fileEntities)
        {
            foreach (string fileEntity in fileEntities)
            {
                // Everything after the last backslash; the whole path when there is none.
                string fileName = fileEntity.Substring(fileEntity.LastIndexOf('\\') + 1);

                Dictionary<string, int> density;
                if (!fileNameTopicDensityMap.TryGetValue(fileName, out density))
                {
                    // First time this file name is seen: read it and compute the
                    // topic -> density table for this one file.
                    string       loweredText = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
                    List<string> terms       = new List<string>(loweredText.Split(' '));

                    density = CalWindowDensityInOneFile(terms);
                    fileNameTopicDensityMap.Add(fileName, density);
                }

                docTopicDensityMap.Add(fileEntity, density);
            }
        }
Пример #8
0
        /// <summary>
        /// Matches each component chunk of the result file against the component
        /// directories found under <paramref name="oriPDFPath"/> and hands every
        /// match to <c>ReadResultFile</c> together with that directory's file names.
        /// </summary>
        /// <remarks>
        /// The result file is read as chunks separated by an empty line. The
        /// component name is the first tab-separated column of a chunk's first line.
        /// A name containing commas is treated as a list of candidates: the first
        /// candidate equal to the lower-cased name of a directory wins. A name
        /// without commas must equal a directory name exactly.
        /// </remarks>
        /// <param name="resultFile">Path of the ranking result file.</param>
        /// <param name="oriPDFPath">
        /// Directory that stores the PDF documents, one sub-directory per component.
        /// </param>
        public void CountDocRelativeSeq(string resultFile, string oriPDFPath)
        {
            // Component (directory name) -> names of the files inside that directory.
            Dictionary<string, List<string>> compFiles = new Dictionary<string, List<string>>();

            foreach (string compDir in Directory.GetDirectories(oriPDFPath))
            {
                string       compName = FileNameParse.GetFileName(compDir);
                List<string> fileList = new List<string>();
                foreach (string filePath in Directory.GetFiles(compDir))
                {
                    fileList.Add(FileNameParse.GetFileName(filePath));
                }
                compFiles.Add(compName, fileList);
            }

            string resultFileContent = FileOperators.ReadFileText(resultFile);

            string[] separators    = new string[] { "\r\n\r\n" };
            string[] lineSeparator = new string[] { "\r\n" };

            foreach (string compChunk in resultFileContent.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] lines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);
                if (lines.Length == 0)
                {
                    // A chunk made only of line breaks has no first line; lines[0]
                    // used to throw here.
                    continue;
                }

                string firstLine = lines[0];
                if (firstLine.IndexOf('\t') < 0)
                {
                    continue;
                }

                string compName = firstLine.Split('\t')[0];

                if (compName.Contains(","))
                {
                    // Several candidate names: stop at the first one that matches.
                    bool matched = false;
                    foreach (string rawCandidate in compName.Split(','))
                    {
                        string candidate = rawCandidate.Trim();
                        foreach (string fileComp in compFiles.Keys)
                        {
                            if (candidate.Equals(fileComp.ToLower()))
                            {
                                ReadResultFile(fileComp, lines, compFiles[fileComp]);
                                matched = true;
                                break;
                            }
                        }

                        if (matched)
                        {
                            break;
                        }
                    }
                }
                else
                {
                    // Exact match: a direct dictionary lookup is equivalent to
                    // scanning the keys for an equal string.
                    List<string> targetFiles;
                    if (compFiles.TryGetValue(compName, out targetFiles))
                    {
                        ReadResultFile(compName, lines, targetFiles);
                    }
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Extracts the component-related paragraphs from the files related to each
        /// component and writes them to <c>storePath + component + ".txt"</c>.
        /// </summary>
        /// <remarks>
        /// Only components present in both <c>compFiles</c> and <c>compTerms</c> are
        /// processed. A paragraph is kept when <c>IsCompRelated</c> accepts its
        /// lower-cased text; paragraphs whose letters-only text was already kept for
        /// the same component are skipped as duplicates.
        /// </remarks>
        /// <param name="compRelatedFile">Not used by this method.</param>
        /// <param name="paras">Directory that holds the paragraph files.</param>
        /// <param name="storePath">
        /// Prefix of the output path; the component name and ".txt" are appended
        /// directly, without inserting a separator.
        /// </param>
        private void ExtractCompParagraphs(string compRelatedFile, string paras, string storePath)
        {
            Regex    nonLetters          = new Regex(@"[^a-zA-Z]");
            string[] paragraphSeparators = new string[] { "\r\n\r\n" };

            foreach (string comp in compFiles.Keys)
            {
                Dictionary<string, float> compTermWeight;
                if (!compTerms.TryGetValue(comp, out compTermWeight))
                {
                    continue;
                }

                // StringBuilder and HashSet avoid the quadratic cost of repeated
                // string concatenation and List.Contains; the output is the same.
                System.Text.StringBuilder compParagraphs = new System.Text.StringBuilder();
                HashSet<string>           seenContent    = new HashSet<string>();

                foreach (string file in compFiles[comp])
                {
                    string realStorePath = paras + "\\" + file;
                    if (!File.Exists(realStorePath))
                    {
                        // Fall back to the same path with spaces replaced by dashes.
                        realStorePath = realStorePath.Replace(" ", "-");
                        if (!File.Exists(realStorePath))
                        {
                            Console.WriteLine("Please check the existence of file: " + file);
                            continue;
                        }
                    }

                    string   fileContent = FileOperators.ReadFileText(realStorePath);
                    string[] paraChunks  = fileContent.Split(paragraphSeparators, StringSplitOptions.RemoveEmptyEntries);
                    foreach (string tmpPara in paraChunks)
                    {
                        if (tmpPara.Trim().Length == 0)
                        {
                            continue;
                        }

                        string lowerCaseLine = tmpPara.ToLower();
                        if (!IsCompRelated(lowerCaseLine, compTermWeight))
                        {
                            continue;
                        }

                        // Compare on letters only, so paragraphs that differ just in
                        // spacing, digits or punctuation count as duplicates.
                        string pureWords = nonLetters.Replace(lowerCaseLine, "");
                        if (seenContent.Add(pureWords))
                        {
                            // One output line per paragraph: inner line breaks removed.
                            compParagraphs.Append(lowerCaseLine.Replace("\r\n", "")).Append("\r\n");
                        }
                    }
                }

                FileOperators.FileWrite(storePath + comp + ".txt", compParagraphs.ToString());
            }
        }
Пример #10
0
        /// <summary>
        /// Splits the text of one file into sentences and returns those that pass
        /// the filters below.
        /// </summary>
        /// <remarks>
        /// The text is processed line by line; each line is cut into sentences at
        /// whitespace that follows '.', '?' or '!' and precedes an ASCII letter.
        /// Sentences are compared against lower-case keywords only.
        /// </remarks>
        /// <param name="filePath">Path of the file to split.</param>
        /// <returns>The accepted sentences, in file order.</returns>
        public List<string> SplitSingleFileSentence(string filePath)
        {
            List<string> accepted = new List<string>();

            Regex    unusualSymbol       = new Regex("[^\\w\\d\\p{P}\\s]");
            char[]   sentencePunctuation = new char[] { ',', '.', '!', ';', ':' };
            string[] bannedFragments     = new string[] { "appendix", "standard", "figure", "table", "section", "version", "http", "www", "error! reference" };
            string[] bannedOpenings      = new string[] { "commentary", "see", "of", "description:", "content" };

            string text = FileOperators.ReadFileText(filePath);

            foreach (string line in text.Split('\r', '\n'))
            {
                // A line without any of these marks is not treated as prose. This
                // also rejects blank and whitespace-only lines.
                if (line.IndexOfAny(sentencePunctuation) < 0)
                {
                    continue;
                }

                foreach (string piece in Regex.Split(line, "(?<=[.?!])\\s+(?=[a-zA-Z])"))
                {
                    // Drop references to document structure, links and broken cross-references.
                    bool mentionsBanned = false;
                    foreach (string fragment in bannedFragments)
                    {
                        if (piece.Contains(fragment))
                        {
                            mentionsBanned = true;
                            break;
                        }
                    }

                    if (mentionsBanned
                        || piece.StartsWith("acronym")
                        || piece.StartsWith("definition")
                        || piece.EndsWith("as shown in fig")
                        || piece.StartsWith("chapter"))
                    {
                        continue;
                    }

                    // Strip everything in front of the first ASCII letter (the input
                    // may not have been cleaned beforehand). Without any letter the
                    // piece is only trimmed.
                    int firstLetter = 0;
                    while (firstLetter < piece.Length)
                    {
                        char symbol        = piece[firstLetter];
                        bool isAsciiLetter = (symbol >= 'a' && symbol <= 'z') || (symbol >= 'A' && symbol <= 'Z');
                        if (isAsciiLetter)
                        {
                            break;
                        }
                        firstLetter++;
                    }

                    string sentence = (firstLetter < piece.Length ? piece.Substring(firstLetter) : piece).Trim();

                    // Sentences ending in a digit are dropped.
                    if (sentence.Length > 1)
                    {
                        char lastSymbol = sentence[sentence.Length - 1];
                        if (lastSymbol >= '0' && lastSymbol <= '9')
                        {
                            continue;
                        }
                    }

                    // Replace symbols that are neither word characters, punctuation
                    // nor whitespace with a blank (no-op when there are none).
                    sentence = unusualSymbol.Replace(sentence, " ");

                    // Ignore synonym definitions and section headings.
                    if (IsSynonym(sentence) || IsSectionHead(sentence))
                    {
                        continue;
                    }

                    if (!ContainEnoughInfo(sentence))
                    {
                        continue;
                    }

                    if (sentence.Contains("the following"))
                    {
                        continue;
                    }

                    bool hasBannedOpening = false;
                    foreach (string opening in bannedOpenings)
                    {
                        if (sentence.StartsWith(opening))
                        {
                            hasBannedOpening = true;
                            break;
                        }
                    }

                    if (!hasBannedOpening)
                    {
                        accepted.Add(sentence);
                    }
                }
            }

            return accepted;
        }