/*
 * calculate the topic density for each file and cache the result by file name
 */
private void CalDocDensityMap(string[] fileEntities)
{
    foreach (string fileEntity in fileEntities)
    {
        string fileName = FileNameParse.GetFileName(fileEntity);
        if (!fileNameTopicDensityMap.ContainsKey(fileName))
        {
            string fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
            char[] delimiters = new char[] { ' ' };
            string[] tmpList = fileContent.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

            //collect the terms in the file, ignoring single-character tokens
            List<string> fileTermList = new List<string>();
            foreach (string tmpTerm in tmpList)
            {
                if (tmpTerm.Length > 1)
                {
                    fileTermList.Add(tmpTerm);
                }
            }
            if (fileTermList.Count == 0)
            {
                continue; //skip an empty file instead of aborting the remaining files
            }

            //get the topics and the related density for one file
            Dictionary<string, float> topicDensity = CalWindowDensityInOneFile(fileTermList);
            fileNameTopicDensityMap.Add(fileName, topicDensity);
            docTopicDensityMap.Add(fileEntity, topicDensity);
        }
        else
        {
            Dictionary<string, float> topicDensity = fileNameTopicDensityMap[fileName];
            docTopicDensityMap.Add(fileEntity, topicDensity);
        }
    }
}
/*
 * calculate the density per document
 */
//find the density of all topics in each document
//files in two levels
public void DoClumpingRank()
{
    if (File.Exists(rankResult))
    {
        File.Delete(rankResult);
    }

    string topicTermRawData = FileOperators.ReadFileText(topicTermsFilePath);
    if (string.IsNullOrEmpty(topicTermRawData))
    {
        return;
    }
    string topicTermContent = topicTermRawData.ToLower();
    topicTerms = ParseTopicTerms.GetTopicTermValueList(topicTermContent);

    docTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();
    fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();

    //rank files in the top-level directory
    string[] fileEntities = Directory.GetFiles(txtCleanFileDir);
    CalDocDensityMap(fileEntities);

    //rank files one directory level down
    string[] dirs = Directory.GetDirectories(txtCleanFileDir);
    foreach (string dir in dirs)
    {
        string[] subFileEntities = Directory.GetFiles(dir);
        CalDocDensityMap(subFileEntities);
    }

    WriteRankingResult();
}
/*
 * calculate the density per document
 */
//find the density of all components in each document
//files in two levels
public void DoClumpingRank(BackgroundWorker backgroundWorker)
{
    if (File.Exists(rankResult))
    {
        File.Delete(rankResult);
    }

    OutputMg.OutputContent(backgroundWorker, "Start parsing topic terms");
    string topicTermContent = FileOperators.ReadFileText(topicTermsFilePath).ToLower();
    topicTerms = ParseTopicTerms.GetTopicTermValueList(topicTermContent);
    OutputMg.OutputContent(backgroundWorker, "Finished parsing topic terms");

    OutputMg.OutputContent(backgroundWorker, "Start ranking topics");
    docTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();
    fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();

    //rank files in the top-level directory
    string[] fileEntities = Directory.GetFiles(txtCleanFileDir);
    CalDocDensityMap(fileEntities);

    //rank files one directory level down
    string[] dirs = Directory.GetDirectories(txtCleanFileDir);
    foreach (string dir in dirs)
    {
        string[] subFileEntities = Directory.GetFiles(dir);
        CalDocDensityMap(subFileEntities);
    }
    OutputMg.OutputContent(backgroundWorker, "Finished ranking topics");

    OutputMg.OutputContent(backgroundWorker, "Start writing ranking results");
    WriteRankingResult();
    OutputMg.OutputContent(backgroundWorker, "Finished writing ranking results");
}
/*
 * the number of times a term occurs in a document,
 * for one term and one doc
 */
private int tf(string term, string filePath)
{
    string fileContent = FileOperators.ReadFileText(filePath);
    //count occurrences by comparing the length before and after removing the term
    int freq = (fileContent.Length - fileContent.Replace(term, "").Length) / term.Length;
    return freq;
}
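// A minimal sketch (not part of the original class) showing how the replace-based
// counting in tf() behaves: it counts raw substring matches, so "cat" is also
// counted inside "category". The class name and sample strings below are made up
// for illustration only.
using System;

class TermFrequencyDemo
{
    static int CountOccurrences(string content, string term)
    {
        if (string.IsNullOrEmpty(term))
        {
            return 0;
        }
        //same trick as tf(): length difference divided by the term length
        return (content.Length - content.Replace(term, "").Length) / term.Length;
    }

    static void Main()
    {
        Console.WriteLine(CountOccurrences("the cat sat on the category list", "cat")); //prints 2
    }
}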
/*
 * parse components and the related files from the results of ranking
 */
private void ParseCompFiles(string compRelatedFiles)
{
    string fileContent = FileOperators.ReadFileText(compRelatedFiles);
    string[] separators = new string[] { "\r\n\r\n" };
    string[] compChunks = fileContent.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    foreach (string compChunk in compChunks)
    {
        string[] lineSeparator = new string[] { "\r\n" };
        string[] chunkLines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);

        //the first line of a chunk holds the component name and its first related file
        string firstLine = chunkLines[0];
        string compName = "";
        List<string> relatedFiles = new List<string>();
        if (firstLine.Contains('\t'))
        {
            string[] lineTerms = firstLine.Split('\t');
            compName = lineTerms[0];
            string fileName = lineTerms[1];
            if (fileName.Contains(@"\"))
            {
                int fileNameIndex = fileName.LastIndexOf(@"\");
                fileName = fileName.Substring(fileNameIndex + 1);
            }
            relatedFiles.Add(fileName);
        }
        else
        {
            Console.WriteLine("wrong format in the compFile! " + firstLine);
        }

        //the remaining lines each hold one related file
        int lineScale = chunkLines.Count();
        for (int i = 1; i < lineScale; i++)
        {
            string curLine = chunkLines[i];
            if (!curLine.Contains('\t'))
            {
                continue;
            }
            string[] lineTerms = curLine.Split('\t');
            string fileName = lineTerms[1];
            if (fileName.Contains(@"\"))
            {
                int fileNameIndex = fileName.LastIndexOf(@"\");
                fileName = fileName.Substring(fileNameIndex + 1);
            }
            relatedFiles.Add(fileName);
        }
        compFiles.Add(compName, relatedFiles);
    }
}
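// Assumed layout of the ranking-result file read by ParseCompFiles (a sketch inferred
// from the parsing logic above; the component and file names are made up). Chunks are
// separated by blank lines, the first tab-separated column holds the component name,
// and the second column holds a related file path whose directory part is stripped:
//
//   Navigation\tC:\results\nav-overview.txt
//   Navigation\tC:\results\nav-routing.txt
//
//   Storage\tC:\results\storage-design.txt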
/*
 * parse the summary.txt file of the Stanford TMT results
 */
public static void Execute(string sourceFilePath, string destFilePath)
{
    if (File.Exists(destFilePath))
    {
        File.Delete(destFilePath);
    }

    string topicContent = FileOperators.ReadFileText(sourceFilePath);
    string[] stringSeparators = new string[] { "\r\n\n\r\n" };
    string[] topicParts = topicContent.Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries);
    foreach (string topicPart in topicParts)
    {
        string[] topicPartSeparators = new string[] { "\r\n" };
        string[] lines = topicPart.Split(topicPartSeparators, StringSplitOptions.RemoveEmptyEntries);
        string singleTopicContent = "";
        float topicSumWeight = 0f;
        foreach (string line in lines)
        {
            if (line.StartsWith("Topic"))
            {
                //header line: topic id, two tabs, the topic's total weight
                int spaceIndex = line.IndexOf("\t\t");
                string topicId = line.Substring(0, spaceIndex);
                string topicWeight = line.Substring(spaceIndex + 2);
                topicSumWeight = float.Parse(topicWeight);
                singleTopicContent += topicId + ":";
            }
            else
            {
                //term line: term, tab, relevance; normalize by the topic's total weight
                string trimmedLine = line.Trim();
                string[] terms = trimmedLine.Split('\t');
                string term = terms[0];
                string relevance = terms[1];
                float relevanceValue = float.Parse(relevance);
                relevanceValue = relevanceValue / topicSumWeight;
                singleTopicContent += term + "," + relevanceValue + ";";
            }
        }
        if (singleTopicContent != "")
        {
            //drop the trailing ';' before writing one topic per line
            singleTopicContent = singleTopicContent.Remove(singleTopicContent.Length - 1);
            FileOperators.FileAppend(destFilePath, singleTopicContent);
        }
    }
}
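// Assumed shape of the data handled by Execute (an illustration derived from the
// parsing logic above, not copied from an actual Stanford TMT run). One block of
// summary.txt such as
//
//   Topic 00\t\t105.73
//   \tservice\t12.4
//   \tquery\t9.1
//
// would be appended to the destination file as one line of the form
//
//   Topic 00:service,0.117...;query,0.086...
//
// i.e. "topicId:term,relevance/topicWeight;term,relevance/topicWeight".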
/*
 * calculate the topic density for each file and cache the result by file name
 */
private void CalDocDensityMap(string[] fileEntities)
{
    foreach (string fileEntity in fileEntities)
    {
        int lastSlashIndex = fileEntity.LastIndexOf('\\');
        string fileName = fileEntity.Substring(lastSlashIndex + 1);
        if (!fileNameTopicDensityMap.ContainsKey(fileName))
        {
            string fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
            List<string> fileTermList = new List<string>(fileContent.Split(' '));

            //get the topics and the related density for one file
            Dictionary<string, int> topicDensity = CalWindowDensityInOneFile(fileTermList);
            fileNameTopicDensityMap.Add(fileName, topicDensity);
            docTopicDensityMap.Add(fileEntity, topicDensity);
        }
        else
        {
            Dictionary<string, int> topicDensity = fileNameTopicDensityMap[fileName];
            docTopicDensityMap.Add(fileEntity, topicDensity);
        }
    }
}
/*
 * oriPDFPath: the directory in which the pdf documents are stored
 */
public void CountDocRelativeSeq(string resultFile, string oriPDFPath)
{
    //parse components and their related files from the directory layout
    Dictionary<string, List<string>> compFiles = new Dictionary<string, List<string>>();
    foreach (string compDir in Directory.GetDirectories(oriPDFPath))
    {
        string compName = FileNameParse.GetFileName(compDir);
        List<string> fileList = new List<string>();
        foreach (string filePath in Directory.GetFiles(compDir))
        {
            string fileName = FileNameParse.GetFileName(filePath);
            fileList.Add(fileName);
        }
        compFiles.Add(compName, fileList);
    }

    //read the result file
    string resultFileContent = FileOperators.ReadFileText(resultFile);
    string[] separators = new string[] { "\r\n\r\n" };
    string[] compChunks = resultFileContent.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    foreach (string compChunk in compChunks)
    {
        string[] lineSeparator = new string[] { "\r\n" };
        string[] lines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);
        string firstLine = lines[0];
        if (!firstLine.Contains("\t"))
        {
            continue;
        }
        string[] lineTerms = firstLine.Split('\t');
        string compName = lineTerms[0];
        if (compName.Contains(","))
        {
            //several candidate component names: try them in order until one matches
            string[] candidateComps = compName.Split(',');
            int candidateScale = candidateComps.Count();
            bool matched = false;
            int candidateIndex = 0;
            while (!matched && candidateIndex < candidateScale)
            {
                string candidate = candidateComps[candidateIndex].Trim();
                foreach (string fileComp in compFiles.Keys)
                {
                    if (candidate.Equals(fileComp.ToLower()))
                    {
                        List<string> targetFiles = compFiles[fileComp];
                        ReadResultFile(fileComp, lines, targetFiles);
                        matched = true;
                        break;
                    }
                }
                candidateIndex++;
            }
        }
        else
        {
            foreach (string fileComp in compFiles.Keys)
            {
                if (fileComp.Equals(compName))
                {
                    List<string> targetFiles = compFiles[fileComp];
                    ReadResultFile(fileComp, lines, targetFiles);
                    break;
                }
            }
        }
    }
}
/*
 * extract the component-related paragraphs according to the component-related files
 */
private void ExtractCompParagraphs(string compRelatedFile, string paras, string storePath)
{
    Regex regex = new Regex(@"[^a-zA-Z]");
    foreach (string comp in compFiles.Keys)
    {
        string compParagraphs = "";
        List<string> relatedFiles = compFiles[comp];
        List<string> relatedContent = new List<string>();
        if (!compTerms.ContainsKey(comp))
        {
            continue;
        }
        Dictionary<string, float> compTermWeight = compTerms[comp];
        foreach (string file in relatedFiles)
        {
            //some paragraph files replace spaces with dashes; fall back to that name
            string paraFile = paras + "\\" + file;
            string realStorePath = paraFile;
            if (!File.Exists(paraFile))
            {
                string tmpPath = paraFile.Replace(" ", "-");
                if (File.Exists(tmpPath))
                {
                    realStorePath = tmpPath;
                }
            }
            if (!File.Exists(realStorePath))
            {
                Console.WriteLine("Please check the existence of file: " + file);
                continue;
            }

            string fileContent = FileOperators.ReadFileText(realStorePath);
            string[] separators = new string[] { "\r\n\r\n" };
            string[] paraChunks = fileContent.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            foreach (string tmpPara in paraChunks)
            {
                if (tmpPara.Trim().Length == 0)
                {
                    continue;
                }
                string lowerCaseLine = tmpPara.ToLower();
                if (IsCompRelated(lowerCaseLine, compTermWeight))
                {
                    //use the letters-only form of the paragraph to detect duplicates
                    string pureWords = regex.Replace(lowerCaseLine, "");
                    if (!relatedContent.Contains(pureWords))
                    {
                        relatedContent.Add(pureWords);
                        lowerCaseLine = lowerCaseLine.Replace("\r\n", "");
                        compParagraphs += lowerCaseLine + "\r\n";
                    }
                }
            }
        }
        FileOperators.FileWrite(storePath + comp + ".txt", compParagraphs);
    }
}
public List<string> SplitSingleFileSentence(string filePath)
{
    //result sentences
    List<string> sentences = new List<string>();
    Regex funnySign = new Regex("[^\\w\\d\\p{P}\\s]");

    //split the file into paragraphs first
    string fileContent = FileOperators.ReadFileText(filePath);
    string[] paras = fileContent.Split(new char[] { '\r', '\n' });
    foreach (string tmpPara in paras)
    {
        if (tmpPara.Trim().Length == 0)
        {
            continue;
        }
        else if (!tmpPara.Contains(",") && !tmpPara.Contains(".") && !tmpPara.Contains("!") && !tmpPara.Contains(";") && !tmpPara.Contains(":"))
        {
            continue;
        }

        //split a paragraph into sentences at '.', '?' or '!' followed by whitespace and a letter
        string[] pieces = Regex.Split(tmpPara, "(?<=[.?!])\\s+(?=[a-zA-Z])");
        foreach (string piece in pieces)
        {
            //skip references to figures, tables, sections, urls and similar boilerplate
            if (piece.Contains("appendix") || piece.Contains("standard") || piece.Contains("figure") || piece.Contains("table") || piece.Contains("section") || piece.Contains("version") || piece.Contains("http") || piece.Contains("www") || piece.Contains("error! reference"))
            {
                continue;
            }
            else if (piece.StartsWith("acronym") || piece.StartsWith("definition") || piece.EndsWith("as shown in fig") || piece.StartsWith("chapter"))
            {
                continue;
            }

            //trim leading non-letter characters so each piece starts with a letter
            //(added for content that was not cleaned beforehand)
            string filteredPiece = piece.Trim();
            for (int i = 0; i < piece.Length; i++)
            {
                char curChar = piece[i];
                if ((curChar > 'z' || curChar < 'a') && (curChar > 'Z' || curChar < 'A'))
                {
                    continue;
                }
                else
                {
                    filteredPiece = piece.Substring(i).Trim();
                    break;
                }
            }

            //drop pieces that end with a digit (usually list items or page references)
            if (filteredPiece.Length > 1)
            {
                char endChar = filteredPiece[filteredPiece.Length - 1];
                if (endChar >= '0' && endChar <= '9')
                {
                    continue;
                }
            }

            //replace characters outside words, digits, punctuation and whitespace
            if (funnySign.Match(filteredPiece).Success)
            {
                filteredPiece = funnySign.Replace(filteredPiece, " ");
            }

            //if the sentence only defines a synonym, ignore it
            if (IsSynonym(filteredPiece))
            {
                continue;
            }
            if (IsSectionHead(filteredPiece))
            {
                continue;
            }

            if (ContainEnoughInfo(filteredPiece) && !filteredPiece.Contains("the following") && !filteredPiece.StartsWith("commentary") && !filteredPiece.StartsWith("see") && !filteredPiece.StartsWith("of") && !filteredPiece.StartsWith("description:") && !filteredPiece.StartsWith("content"))
            {
                sentences.Add(filteredPiece);
            }
        }
    }
    return sentences;
}
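// A minimal, self-contained sketch (assumed, not part of the original sources) of the
// sentence-splitting regex used in SplitSingleFileSentence: it breaks after '.', '?'
// or '!' only when whitespace and a letter follow, so decimal numbers such as "3.5"
// are not split. The sample paragraph below is made up for illustration.
using System;
using System.Text.RegularExpressions;

class SentenceSplitDemo
{
    static void Main()
    {
        string para = "The system shall log errors. Version 3.5 adds retry logic! Is that enough?";
        string[] pieces = Regex.Split(para, "(?<=[.?!])\\s+(?=[a-zA-Z])");
        foreach (string piece in pieces)
        {
            Console.WriteLine(piece);
        }
        //prints three sentences; "3.5" stays intact because no letter follows the dot
    }
}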