/* Append the manually-configured search terms for the current target topic
 * (targetTopicName) to topicTerms, skipping duplicates. The manual term file
 * holds one "topicName:term1,term2,..." entry per line. */
private void AddUserSearchTerms(List<string> topicTerms)
{
    string manualTermPath = Configures.GetManualSearchTermPath();
    if (!File.Exists(manualTermPath))
    {
        return;
    }
    foreach (string line in FileOperators.ReadFileLines(manualTermPath))
    {
        int colonIndex = line.IndexOf(":");
        if (colonIndex < 0)
        {
            continue; // not a topic:terms line
        }
        string topicName = line.Substring(0, colonIndex);
        if (!topicName.Equals(targetTopicName))
        {
            continue; // entry belongs to a different topic
        }
        foreach (string manualTerm in line.Substring(colonIndex + 1).Split(','))
        {
            string trimmed = manualTerm.Trim();
            if (!topicTerms.Contains(trimmed))
            {
                topicTerms.Add(trimmed);
            }
        }
    }
}
/* Extract the plain text of a Word document with TextExtractor and write the
 * result to destFileName. */
public static void ExecuteWordExtraction(string sourceFileName, string destFileName)
{
    var extractor = new TextExtractor(sourceFileName);
    FileOperators.FileWrite(destFileName, extractor.ExtractText());
}
/* For every file in fileEntities, compute the per-topic density map and
 * record it under both the bare file name (fileNameTopicDensityMap) and the
 * full path (docTopicDensityMap). A file whose name has already been
 * processed reuses the cached density map instead of re-reading the file. */
private void CalDocDensityMap(string[] fileEntities)
{
    foreach (string fileEntity in fileEntities)
    {
        string fileName = FileNameParse.GetFileName(fileEntity);
        if (!fileNameTopicDensityMap.ContainsKey(fileName))
        {
            string fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
            char[] delimiters = new char[] { ' ' };
            string[] tmpList = fileContent.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
            List<string> fileTermList = new List<string>();
            foreach (string tmpTerm in tmpList) //collect the terms in file
            {
                if (tmpTerm.Length > 1) // single characters carry no topic signal
                {
                    fileTermList.Add(tmpTerm);
                }
            }
            if (fileTermList.Count == 0)
            {
                // BUG FIX: was 'break', which silently aborted processing of ALL
                // remaining files as soon as one file had no usable terms.
                // Skip just this empty file instead.
                continue;
            }
            Dictionary<string, float> topicDensity = CalWindowDensityInOneFile(fileTermList); //get the topic and the related density for one file
            fileNameTopicDensityMap.Add(fileName, topicDensity);
            docTopicDensityMap.Add(fileEntity, topicDensity);
        }
        else
        {
            Dictionary<string, float> topicDensity = fileNameTopicDensityMap[fileName];
            docTopicDensityMap.Add(fileEntity, topicDensity);
        }
    }
}
/* Extract the text of every text-bearing shape on every slide of a PowerPoint
 * presentation (in slide order) and write the space-joined result to
 * destFileName. The presentation and the PowerPoint application are shut down
 * even when extraction throws. */
public static void ExecuteExtraction(string sourceFileName, string destFileName)
{
    Application _application = new Microsoft.Office.Interop.PowerPoint.Application();
    var pres = _application.Presentations;
    Presentation pptFile = pres.Open(sourceFileName, MsoTriState.msoFalse, MsoTriState.msoFalse, MsoTriState.msoFalse);
    try
    {
        // FIX: StringBuilder instead of O(n^2) string concatenation over slides.
        var storeContent = new System.Text.StringBuilder();
        int slideCount = pptFile.Slides.Count;
        for (int i = 1; i <= slideCount; i++) // PowerPoint collections are 1-based
        {
            Slide slide = pptFile.Slides[i];
            slide.FollowMasterBackground = MsoTriState.msoFalse;
            foreach (var item in slide.Shapes)
            {
                var shape = (Microsoft.Office.Interop.PowerPoint.Shape)item;
                if (shape.HasTextFrame == MsoTriState.msoTrue)
                {
                    //shape.Fill.ForeColor.RGB = System.Drawing.ColorTranslator.ToWin32(Color.Red);
                    var text = shape.TextFrame.TextRange.Text;
                    storeContent.Append(text).Append(' ');
                }
            }
        }
        FileOperators.FileWrite(destFileName, storeContent.ToString());
    }
    finally
    {
        // FIX: previously Close/Quit only ran on the success path, leaking a
        // PowerPoint process whenever extraction threw.
        pptFile.Close();
        _application.Quit();
    }
}
/*update here in future. We didnot consider the frequency now! */
/* Read the n-gram file and return each bigram as "first<>second" when both
 * terms are longer than two characters. Each line is expected to look like
 * "first<>second<>freq <rest>"; the frequency field is currently ignored. */
private List<string> FilterNGram()
{
    List<string> ngrams = new List<string>();
    string[] ngramLines = FileOperators.ReadFileLines(n_gramFile);
    foreach (string ngram in ngramLines)
    {
        int spaceIndex = ngram.IndexOf(" ");
        if (spaceIndex > -1)
        {
            string textFreqStr = ngram.Substring(0, spaceIndex);
            string[] separators = { "<>" };
            string[] terms = textFreqStr.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            if (terms.Length == 3)
            {
                string first = terms[0];
                string second = terms[1];
                // FIX: the frequency field (terms[2]) was float.Parse'd into an
                // unused local, which could throw FormatException on malformed
                // input for no benefit. Frequency is deliberately ignored for
                // now (see the note above).
                if (first.Length > 2 && second.Length > 2)
                {
                    ngrams.Add(first + "<>" + second);
                }
            }
            else
            {
                Console.WriteLine("check!");
            }
        }
    }
    return (ngrams);
}
/* Parse a topic-term file into topicID -> (term -> weight). Each line has the
 * form "topicID:term1,w1;term2,w2;...". Malformed lines/entries are skipped
 * instead of crashing; for duplicate terms and topic IDs the first occurrence
 * wins (matching the other ParseTopicTerms overloads, which guard duplicates). */
public static Dictionary<string, Dictionary<string, float>> ParseTopicTerms(string topicTermsPath)
{
    Dictionary<string, Dictionary<string, float>> topicTerms = new Dictionary<string, Dictionary<string, float>>();
    string[] topicLines = FileOperators.ReadFileLines(topicTermsPath);
    foreach (string line in topicLines)
    {
        if (line.Length == 0)
        {
            continue;
        }
        int colonIndex = line.IndexOf(":");
        if (colonIndex < 0)
        {
            // FIX: a line without ':' previously crashed Substring with index -1.
            continue;
        }
        string topicID = line.Substring(0, colonIndex);
        Dictionary<string, float> termValueMap = new Dictionary<string, float>();
        string termValuePart = line.Substring(colonIndex + 1);
        string[] terms = termValuePart.Split(';');
        foreach (string termValue in terms)
        {
            string[] termAndValue = termValue.Split(',');
            if (termAndValue.Length < 2)
            {
                // FIX: entries without ',' previously threw IndexOutOfRangeException.
                continue;
            }
            string term = termAndValue[0].Trim().ToLower();
            string value = termAndValue[1];
            if (!termValueMap.ContainsKey(term))
            {
                termValueMap.Add(term, float.Parse(value));
            }
        }
        if (!topicTerms.ContainsKey(topicID))
        {
            // FIX: a duplicated topic ID previously threw ArgumentException on Add.
            topicTerms.Add(topicID, termValueMap);
        }
    }
    return (topicTerms);
}
/* Parse a topic-term file into topicName -> (term -> proportion). Each line
 * has the form "topicName:term1,p1;term2,p2;..."; lines whose value section
 * contains no ';' yield an empty term map, and the first occurrence of a
 * topic name wins. */
private static Dictionary<string, Dictionary<string, double>> ParseTopicTerms(string topicTermsFileName)
{
    var topicTermPropMap = new Dictionary<string, Dictionary<string, double>>();
    foreach (string line in FileOperators.ReadFileLines(topicTermsFileName))
    {
        int colonPos = line.IndexOf(':');
        string topicName = line.Substring(0, colonPos);
        string valueSection = line.Substring(colonPos + 1);
        var termProps = new Dictionary<string, double>();
        if (valueSection.Contains(";"))
        {
            foreach (string pair in valueSection.Split(';'))
            {
                int commaPos = pair.IndexOf(',');
                string term = pair.Substring(0, commaPos);
                string propStr = pair.Substring(commaPos + 1);
                termProps.Add(term, double.Parse(propStr));
            }
        }
        if (!topicTermPropMap.ContainsKey(topicName))
        {
            topicTermPropMap.Add(topicName, termProps);
        }
    }
    return (topicTermPropMap);
}
//too slow, deprecated
/* Extract the text of every paragraph of a Word document and write the
 * concatenation to destFileName. Extraction errors are swallowed (best-effort,
 * this extractor is deprecated); Word is always closed. */
public static void ExecuteExtraction(string sourceFileName, string destFileName)
{
    Application _application = new Application();
    object docFilenameAsObject = sourceFileName;
    Document _document = _application.Documents.Open(ref docFilenameAsObject);
    try
    {
        // FIX: StringBuilder instead of O(n^2) string concatenation, and the
        // unused paragraph-count local has been removed.
        var docContent = new System.Text.StringBuilder();
        foreach (Paragraph para in _document.Paragraphs)
        {
            docContent.Append(para.Range.Text);
        }
        FileOperators.FileWrite(destFileName, docContent.ToString());
    }
    catch (Exception)
    {
        // Deliberate best-effort behavior preserved: extraction failures are ignored.
    }
    finally
    {
        // FIX: Close/Quit were duplicated in both the success and catch paths;
        // a single finally block guarantees cleanup on every path.
        ((_Document)_document).Close();
        ((_Application)_application).Quit();
    }
}
/*
 * calculate the density per document
 */
//find the density of all topics in each document
//files in two levels
public void DoClumpingRank()
{
    if (File.Exists(rankResult))
    {
        File.Delete(rankResult); // start from a clean result file
    }
    string rawTopicTerms = FileOperators.ReadFileText(topicTermsFilePath);
    if (string.IsNullOrEmpty(rawTopicTerms))
    {
        return; // nothing to rank without topic terms
    }
    topicTerms = ParseTopicTerms.GetTopicTermValueList(rawTopicTerms.ToLower());
    docTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();
    fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();
    // Process the top-level files, then the files one directory level down.
    CalDocDensityMap(Directory.GetFiles(txtCleanFileDir));
    foreach (string dir in Directory.GetDirectories(txtCleanFileDir))
    {
        CalDocDensityMap(Directory.GetFiles(dir));
    }
    WriteRankingResult();
}
/* Parse a topic-term file into topicID -> list of terms. Each line has the
 * form "topicID:term1,v1;term2,v2;..."; only the term half of each pair is
 * kept, and value sections without ';' produce an empty list. */
private static Dictionary<string, List<string>> ParseTopicTerms(string topicTermFilePath)
{
    var topicTerms = new Dictionary<string, List<string>>();
    foreach (string line in FileOperators.ReadFileLines(topicTermFilePath))
    {
        int colonPos = line.IndexOf(':');
        string topicID = line.Substring(0, colonPos);
        string valueSection = line.Substring(colonPos + 1);
        var terms = new List<string>();
        if (valueSection.Contains(";"))
        {
            foreach (string pair in valueSection.Split(';'))
            {
                terms.Add(pair.Substring(0, pair.IndexOf(',')));
            }
        }
        topicTerms.Add(topicID, terms);
    }
    return (topicTerms);
}
/*
 * calculate the density per document
 */
//find the density of all components in each document.
//files in two levels
/* Same pipeline as the parameterless overload, with progress reported through
 * OutputMg to the supplied BackgroundWorker. */
public void DoClumpingRank(BackgroundWorker backgroundWorker)
{
    if (File.Exists(rankResult))
    {
        File.Delete(rankResult);
    }
    OutputMg.OutputContent(backgroundWorker, "Start parsing topic terms");
    string rawTopicTerms = FileOperators.ReadFileText(topicTermsFilePath);
    if (string.IsNullOrEmpty(rawTopicTerms))
    {
        // FIX: guard empty/missing topic-term content before calling ToLower(),
        // matching the parameterless DoClumpingRank overload; previously this
        // could throw a NullReferenceException.
        return;
    }
    string topicTermContent = rawTopicTerms.ToLower();
    topicTerms = ParseTopicTerms.GetTopicTermValueList(topicTermContent);
    OutputMg.OutputContent(backgroundWorker, "Finished parsing topic terms.");
    OutputMg.OutputContent(backgroundWorker, "Start ranking topic");
    docTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();
    fileNameTopicDensityMap = new Dictionary<string, Dictionary<string, float>>();
    string[] fileEntities = Directory.GetFiles(txtCleanFileDir);
    CalDocDensityMap(fileEntities);
    string[] dirs = Directory.GetDirectories(txtCleanFileDir);
    foreach (string dir in dirs)
    {
        string[] subFileEntities = Directory.GetFiles(dir);
        // FIX: removed a duplicated commented-out copy of this call.
        CalDocDensityMap(subFileEntities);
    }
    OutputMg.OutputContent(backgroundWorker, "Finished ranking topic");
    OutputMg.OutputContent(backgroundWorker, "Start writing ranking topic");
    WriteRankingResult();
    OutputMg.OutputContent(backgroundWorker, "Finished writing ranking topic");
}
/* Parse the terms and their weights from the TopicManualTerms.txt */
// Each line: "compName:term1,w1;term2,w2;...". Entries without a comma and
// lines without a colon are skipped. Results accumulate into compTerms.
private void ParseCompTerms(string searchTerms)
{
    foreach (string aLine in FileOperators.ReadFileLines(searchTerms))
    {
        int colonPos = aLine.IndexOf(":");
        if (colonPos < 0)
        {
            continue; // not a component line
        }
        string compName = aLine.Substring(0, colonPos);
        var termWeightDic = new Dictionary<string, float>();
        foreach (string termWeight in aLine.Substring(colonPos + 1).Split(';'))
        {
            if (!termWeight.Contains(","))
            {
                continue; // malformed pair, no weight attached
            }
            string[] parts = termWeight.Split(',');
            termWeightDic.Add(parts[0], float.Parse(parts[1]));
        }
        compTerms.Add(compName, termWeightDic);
    }
}
/* Return the lower-cased, trimmed terms for the topic whose line starts with
 * targetTopicName. Lines look like "topic:term1,v1;term2,v2" or, for a single
 * entry, "topic:term,v". Entries without a ',' contribute the whole entry as
 * the term. */
public static List<string> ParseTopicTerms(string topicTermPath, string targetTopicName)
{
    List<string> topicTerms = new List<string>();
    string[] topicTermLines = FileOperators.ReadFileLines(topicTermPath);
    foreach (string line in topicTermLines)
    {
        if (line.StartsWith(targetTopicName))
        {
            int colonIndex = line.IndexOf(':');
            string termValues = line.Substring(colonIndex + 1);
            if (termValues.Contains(";"))
            {
                string[] termValueList = termValues.Split(';');
                foreach (string termValuePair in termValueList)
                {
                    int commaIndex = termValuePair.IndexOf(',');
                    // FIX: guard entries without ','; Substring(0, -1) threw.
                    string pairTerm = commaIndex < 0 ? termValuePair : termValuePair.Substring(0, commaIndex);
                    topicTerms.Add(pairTerm.ToLower().Trim());
                }
            }
            else
            {
                int commaIndex = termValues.IndexOf(',');
                // FIX: same guard for the single-entry case, which previously
                // crashed with Substring(0, -1) when the entry had no ','.
                string term = commaIndex < 0 ? termValues : termValues.Substring(0, commaIndex);
                topicTerms.Add(term.ToLower().Trim());
            }
        }
    }
    return (topicTerms);
}
/* store the stemmed file as the original hierarchy */
/* Recursively walk sourceFile; for each text file, stem every whitespace- or
 * comma-separated term with Porter2 and write the stemmed lines to the
 * mirrored location under stemmedPath. Non-directories are ignored. */
public void FileDirStemming(string sourceFile, string stemmedPath)
{
    if (!IsDirectory(sourceFile))
    {
        return;
    }
    foreach (string subDir in Directory.GetDirectories(sourceFile))
    {
        string subDirName = FileNameParse.GetFileName(subDir);
        string storePath = stemmedPath + "\\" + subDirName;
        FileDirStemming(subDir, storePath);
    }
    // FIX: hoist the stemmer and separator array out of the per-line loop —
    // they were re-allocated once per line (the original already reused one
    // stemmer across all terms of a line, so sharing it more widely is safe).
    Porter2 porter = new Porter2();
    string[] separators = { " ", "," };
    foreach (string subFile in Directory.GetFiles(sourceFile))
    {
        string fileName = FileNameParse.GetFileName(subFile);
        // FIX: StringBuilder instead of O(n^2) string concatenation per file.
        var stemmedContent = new System.Text.StringBuilder();
        foreach (string fileLine in FileOperators.ReadFileLines(subFile))
        {
            foreach (string term in fileLine.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                stemmedContent.Append(porter.stem(term)).Append(' ');
            }
            stemmedContent.Append("\r\n");
        }
        FileOperators.FileWrite(stemmedPath + "\\" + fileName, stemmedContent.ToString());
    }
}
/*the number of times a term occurs in a document
 * for one term and one doc.
 */
// Counts non-overlapping occurrences of 'term' in the file by measuring how
// much shorter the content becomes when every occurrence is removed.
private int tf(string term, string filePath)
{
    // FIX: an empty term previously caused a divide-by-zero below.
    if (string.IsNullOrEmpty(term))
    {
        return 0;
    }
    string fileContent = FileOperators.ReadFileText(filePath);
    return (fileContent.Length - fileContent.Replace(term, "").Length) / term.Length;
}
/* Extract the displayed text of every used cell on every worksheet of an
 * Excel workbook and append it, space-separated per sheet, to destFileName.
 * The workbook and Excel application are shut down even when extraction
 * throws. */
public static void ExecuteExtraction(string sourceFileName, string destFileName)
{
    Application _application = new Application();
    Workbook oriWorkBook = _application.Workbooks.Open(sourceFileName);
    try
    {
        int sheetNum = oriWorkBook.Worksheets.Count;
        for (int i = 1; i <= sheetNum; i++) // Excel collections are 1-based
        {
            Worksheet xlWorkSheet = oriWorkBook.Worksheets[i];
            // Detect Last used Row - Ignore cells that contains formulas that result in blank values
            Range lastRowCell = xlWorkSheet.Cells.Find(
                "*",
                System.Reflection.Missing.Value,
                Microsoft.Office.Interop.Excel.XlFindLookIn.xlValues,
                Microsoft.Office.Interop.Excel.XlLookAt.xlWhole,
                Microsoft.Office.Interop.Excel.XlSearchOrder.xlByRows,
                Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                false,
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value);
            // Detect Last Used Column - Ignore cells that contains formulas that result in blank values
            Range lastColCell = xlWorkSheet.Cells.Find(
                "*",
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value,
                Microsoft.Office.Interop.Excel.XlSearchOrder.xlByColumns,
                Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                false,
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value);
            if (lastRowCell == null || lastColCell == null)
            {
                // FIX: Find returns null on a sheet with no used cells;
                // previously dereferencing .Row threw NullReferenceException.
                continue;
            }
            int lastRowIgnoreFormulas = lastRowCell.Row;
            int lastColIgnoreFormulas = lastColCell.Column;
            // FIX: StringBuilder instead of O(n^2) string concatenation over cells.
            var fileContent = new System.Text.StringBuilder();
            for (int j = 1; j <= lastRowIgnoreFormulas; j++)
            {
                for (int k = 1; k <= lastColIgnoreFormulas; k++)
                {
                    Range usedRange = xlWorkSheet.Cells[j, k];
                    string rangeText = usedRange.Text;
                    fileContent.Append(rangeText).Append(' ');
                }
            }
            FileOperators.FileAppend(destFileName, fileContent.ToString());
        }
    }
    finally
    {
        // FIX: close the workbook and quit Excel even when extraction throws,
        // so no orphan Excel process is left behind.
        oriWorkBook.Close(XlSaveAction.xlDoNotSaveChanges);
        _application.Application.Quit();
    }
}
/*
 * parse component and the related files from the results of ranking
 */
// Each blank-line-separated chunk lists one component: every line is
// "compName\tfilePath" (the component name is taken from the first line only)
// and each file path is reduced to its bare file name.
private void ParseCompFiles(string compRelatedFiles)
{
    string fileContent = FileOperators.ReadFileText(compRelatedFiles);
    string[] chunkSeparator = new string[] { "\r\n\r\n" };
    foreach (string compChunk in fileContent.Split(chunkSeparator, StringSplitOptions.RemoveEmptyEntries))
    {
        string[] lineSeparator = new string[] { "\r\n" };
        string[] chunkLines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);
        string compName = "";
        List<string> relatedFiles = new List<string>();
        for (int i = 0; i < chunkLines.Length; i++)
        {
            string curLine = chunkLines[i];
            if (!curLine.Contains('\t'))
            {
                if (i == 0)
                {
                    Console.WriteLine("wrong in the compFile! " + curLine);
                }
                continue;
            }
            string[] lineTerms = curLine.Split('\t');
            if (i == 0)
            {
                compName = lineTerms[0]; // component name comes from the first line
            }
            string fileName = lineTerms[1];
            int slashIndex = fileName.LastIndexOf(@"\");
            if (slashIndex >= 0)
            {
                fileName = fileName.Substring(slashIndex + 1); // keep bare file name
            }
            relatedFiles.Add(fileName);
        }
        compFiles.Add(compName, relatedFiles);
    }
}
/* Serialize component -> domain-term lists into the fixed topic-term format
 * ("component:name,1;term,0.2;..." — one line per component with a non-empty
 * term list) and append the result to fixedFormatPath. */
private void WriteDomainTerms(Dictionary <string, List <string> > componentTerms)
{
    string domainTermContent = "";
    //write component level info
    foreach (string component in componentTerms.Keys)
    {
        List <string> terms = componentTerms[component];
        if (terms.Count == 0)
        {
            continue; // components without terms produce no output line
        }
        //float termCount = terms.Count;
        //float prop = 1/termCount;
        domainTermContent += component + ":";
        // The component name(s) themselves are emitted as terms with weight 1.0;
        // a comma-separated component contributes each of its names.
        if (component.Contains(","))
        {
            string[] componentNames = component.Split(',');
            foreach (string name in componentNames)
            {
                domainTermContent += name + "," + 1.0 + ";";
            }
        }
        else
        {
            domainTermContent += component + "," + 1.0 + ";";
        }
        // Every domain term gets a fixed weight of 0.2.
        foreach (string term in terms)
        {
            domainTermContent += term + "," + 0.2 + ";";
        }
        // Drop the trailing ';' and lower-case, then terminate the line.
        // NOTE(review): ToLower is applied to the ENTIRE accumulated buffer,
        // not just the current component's line (and again on the final trim
        // below) — looks redundant but harmless; confirm it is intended.
        if (domainTermContent.EndsWith(";"))
        {
            domainTermContent = domainTermContent.Remove(domainTermContent.Length - 1).ToLower() + "\r\n";
        }
        else
        {
            domainTermContent += "\r\n";
        }
    }
    // Strip the final "\r\n" so the appended block has no trailing blank line.
    if (domainTermContent.EndsWith("\r\n"))
    {
        domainTermContent = domainTermContent.Remove(domainTermContent.Length - 2).ToLower();
    }
    FileOperators.FileAppend(fixedFormatPath, domainTermContent);
}
/* Extract the text of every page of a PDF with iTextSharp and append it
 * page-by-page to destFileName. The optional BackgroundWorker parameter is
 * currently unused but kept for interface compatibility. */
public static void ExecuteExtraction(string sourceFileName, string destFileName, BackgroundWorker backgroundWorker = null)
{
    PdfReader reader = new PdfReader(sourceFileName);
    try
    {
        int pdfNum = reader.NumberOfPages;
        for (int i = 1; i <= pdfNum; i++) // PDF pages are 1-based
        {
            var textPos = new MyLocationExtractionStrategy(); //Create an instance of our strategy
            string ex = PdfTextExtractor.GetTextFromPage(reader, i, textPos); //store the text and the position info in textPos
            FileOperators.FileAppend(destFileName, ex);
        }
    }
    finally
    {
        // FIX: the PdfReader was never closed, leaking the underlying file handle.
        reader.Close();
    }
}
/* Build a topicID -> human-readable-name map from the optional topic-name
 * file (lines of "topicID:topicName"). Returns an empty map when no file path
 * is configured; malformed lines are skipped and the first mapping for a
 * duplicated ID wins. */
private Dictionary<string, string> mapTopicIDName()
{
    Dictionary<string, string> IDNameMap = new Dictionary<string, string>();
    if (this.topicNamePath.Length > 0)
    {
        string[] fileContent = FileOperators.ReadFileLines(this.topicNamePath);
        foreach (string line in fileContent)
        {
            string[] terms = line.Split(':');
            if (terms.Length < 2)
            {
                // FIX: a line without ':' previously threw IndexOutOfRangeException.
                continue;
            }
            string topicID = terms[0];
            string topicName = terms[1];
            if (!IDNameMap.ContainsKey(topicID))
            {
                // FIX: a duplicated topic ID previously crashed Add.
                IDNameMap.Add(topicID, topicName);
            }
        }
    }
    return (IDNameMap);
}
/* Run the full cleaning pipeline on one text file: strip decorations,
 * detail-clean, collapse runs of spaces, merge content, then write the
 * joined result to destFileName. */
private void CleanSingleFile(string sourceFilePath, string destFileName)
{
    // FIX: removed an unused 'fileName' local that was computed but never read.
    string[] oriLines = FileOperators.ReadFileLines(sourceFilePath);
    List<string> filterDecorate = DeleteDecorate(oriLines); //contents, list of figures, list of tables, appendix
    //SplitParagraph(filterDecorate);
    List<string> cleanedText = DetailClean(filterDecorate);
    List<string> furtherClean = CleanMoreSpace(cleanedText); //filter the more consecutive empty space in each line
    List<string> mergedContent = MergeContent(furtherClean);
    string filteredContent = String.Join("\r\n", mergedContent);
    FileOperators.FileWrite(destFileName, filteredContent);
}
/* Load every topic's term->relevance map from topicTermsPath, substitute
 * human-readable topic names where the ID-name file provides one, and store
 * the normalized result in normalizedTopicTerms. */
private void getAllTopicTerms()
{
    var topicTerms = new Dictionary<string, Dictionary<string, string>>();
    Dictionary<string, string> topicIDNameMap = mapTopicIDName();
    foreach (string line in FileOperators.ReadFileLines(topicTermsPath))
    {
        if (line.Length == 0)
        {
            continue; // skip blank lines
        }
        int colonPos = line.IndexOf(":");
        string topicID = line.Substring(0, colonPos);
        // Prefer the configured display name; fall back to the raw ID.
        string topicName = topicIDNameMap.ContainsKey(topicID) ? topicIDNameMap[topicID] : topicID;
        var termValueMap = new Dictionary<string, string>();
        foreach (string termValue in line.Substring(colonPos + 1).Split(';'))
        {
            string[] termAndValue = termValue.Split(',');
            string term = termAndValue[0].Trim().ToLower();
            string value = termAndValue[1];
            if (!termValueMap.ContainsKey(term))
            {
                termValueMap.Add(term, value); // first occurrence of a term wins
            }
        }
        topicTerms.Add(topicName, termValueMap);
    }
    var normalizer = new NormalizeTopicRelevance();
    normalizedTopicTerms = normalizer.DoNormalize(topicTerms);
}
/*parse the summary.txt file of the standford tmt results */
// Converts each topic block ("Topic N\t\tsumWeight" header followed by
// "term\trelevance" lines) into one appended output line
// "TopicN:term1,normRel1;term2,normRel2;...", where each relevance is
// normalized by the topic's summed weight from the header line.
public static void Execute(string sourceFilePath, string destFilePath)
{
    // Start from a clean output file since topic lines are appended one by one.
    if (File.Exists(destFilePath))
    {
        File.Delete(destFilePath);
    }
    string topicContent = FileOperators.ReadFileText(sourceFilePath);
    // Topic blocks are separated by a blank line ("\r\n\n\r\n" in this format).
    string[] stringSeparators = new string[] { "\r\n\n\r\n" };
    string[] topicParts = topicContent.Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries);
    foreach (string topicPart in topicParts)
    {
        string[] topicPartSeparators = new string[] { "\r\n" };
        string[] lines = topicPart.Split(topicPartSeparators, StringSplitOptions.RemoveEmptyEntries);
        string singleTopicContent = "";
        // Set by the "Topic" header line, then used to normalize every
        // following term line of the same block — line order matters here.
        float topicSumWeight = 0f;
        foreach (string line in lines)
        {
            if (line.StartsWith("Topic"))
            {
                // Header line: "Topic N\t\t<summed weight>".
                int spaceIndex = line.IndexOf("\t\t");
                string topicId = line.Substring(0, spaceIndex);
                string topicWeight = line.Substring(spaceIndex + 2);
                topicSumWeight = float.Parse(topicWeight);
                singleTopicContent += topicId + ":";
            }
            else
            {
                // Term line: "<term>\t<relevance>".
                string trimedLine = line.Trim();
                string[] terms = trimedLine.Split('\t');
                string term = terms[0];
                string relevance = terms[1];
                float relevanceValue = float.Parse(relevance);
                // Normalize by the topic's total weight from the header line.
                relevanceValue = relevanceValue / topicSumWeight;
                singleTopicContent += term + "," + relevanceValue + ";";
            }
        }
        if (singleTopicContent != "")
        {
            // Drop the trailing ';' before appending this topic's line.
            singleTopicContent = singleTopicContent.Remove(singleTopicContent.Length - 1);
            FileOperators.FileAppend(destFilePath, singleTopicContent);
        }
    }
}
/* intput: the component that the summary is related with.
 * compName: the target component that can be a single component or a
 * component\\subcomponent if the target is a subcomponent */
public void GenerateSummary(string compTextPath, string compName, List<string> subcompTerms, string summaryStore)
{
    string compPath = compTextPath + "\\" + compName;
    List<string> subcompNames = new List<string>();
    List<string> candidateSentences = SplitSentences(compPath); //split the paragraph into sentences
    if (IsDirectory(compPath)) //if the target component is a 'component'
    {
        //1. create the subcomponent set. read all of the subcomponents! Here we identify the subcomponent from folders. To the subcomponent that the folders don't contain, they must be not in the critical content of components
        //2. write all of the related sentences in one document. calculate the scores of sentences in all of the related documents
        foreach (string subcomp in Directory.GetFiles(compPath))
        {
            subcompNames.Add(FileNameParse.GetFileName(subcomp));
        }
    }
    else //if the target component is a 'subcomponent'. Calculate the scores of the sentences in one file
    {
        subcompNames.Add(FileNameParse.GetFileName(compPath));
        subcompNames.AddRange(subcompTerms); // include the supplied acronyms/terms
    }
    var aSummary = new MMRSummary();
    string summary = aSummary.GenerateSummary(subcompNames, candidateSentences);
    if (!string.IsNullOrEmpty(summary))
    {
        FileOperators.FileWrite(summaryStore, summary);
    }
    else
    {
        Console.WriteLine("summary is empty:" + summary + ":" + compName);
    }
}
/*generate and store the tf-idf of each file */
// Writes a header row ("fileName," followed by the ';'-separated tactic
// terms), computes the idf of every term once, then appends one tf-idf row
// per file for the top-level files and the files one directory level down.
public void calTfidf()
{
    string titleContent = "fileName,";
    foreach (string term in tacticTerms)
    {
        titleContent += term + ";";
    }
    titleContent = titleContent.Substring(0, titleContent.Length - 1); // drop trailing separator
    FileOperators.FileWrite(this.storePath, titleContent);
    terms_idf = idf(); //calculate idf
    updateTfidf(Directory.GetFiles(fileDir));
    foreach (string directory in Directory.GetDirectories(fileDir))
    {
        updateTfidf(Directory.GetFiles(directory));
    }
}
// Append one row per file to the tf-idf store: the file path followed by the
// ';'-separated tf*idf value of each tactic term.
private void updateTfidf(string[] fileEntries)
{
    foreach (string fileName in fileEntries)
    {
        string tfidfContent = fileName;
        foreach (string term in tacticTerms)
        {
            double tf_idf = tf(term, fileName) * terms_idf[term];
            tfidfContent += ";" + tf_idf;
        }
        FileOperators.FileAppend(this.storePath, tfidfContent);
    }
}
// Invert docTopicDensityMap into topic -> (doc -> density), then append, per
// topic, the documents sorted by decreasing density (zero densities skipped)
// as "topic\tfileName\tdensity" lines to the rank-result file.
private void WriteRankingResult()
{
    var topicDocDensity = new Dictionary<string, Dictionary<string, float>>();
    foreach (KeyValuePair<string, Dictionary<string, float>> docEntry in docTopicDensityMap) //transfer the primaryKey from docId to topicId
    {
        foreach (KeyValuePair<string, float> topicEntry in docEntry.Value)
        {
            Dictionary<string, float> docDensity;
            if (!topicDocDensity.TryGetValue(topicEntry.Key, out docDensity))
            {
                docDensity = new Dictionary<string, float>();
                topicDocDensity.Add(topicEntry.Key, docDensity);
            }
            docDensity.Add(docEntry.Key, topicEntry.Value);
        }
    }
    foreach (KeyValuePair<string, Dictionary<string, float>> topicEntry in topicDocDensity)
    {
        Dictionary<string, float> sortedDocDensity = DictionaryDecreasedSort.DecreasedByValue(topicEntry.Value);
        string docAndDensity = "";
        foreach (KeyValuePair<string, float> docEntry in sortedDocDensity)
        {
            string fileName = docEntry.Key.Substring(filePathLength); // strip the base directory prefix
            if (docEntry.Value > 0)
            {
                docAndDensity += topicEntry.Key + "\t" + fileName + "\t" + docEntry.Value + "\r\n";
            }
        }
        FileOperators.FileAppend(rankResult, docAndDensity);
    }
}
// For each file path, compute (or reuse by bare file name) the per-topic
// density map and register it under both the file name and the full path.
private void CalDocDensityMap(string[] fileEntities)
{
    foreach (string fileEntity in fileEntities)
    {
        string fileName = fileEntity.Substring(fileEntity.LastIndexOf('\\') + 1);
        Dictionary<string, int> topicDensity;
        if (fileNameTopicDensityMap.ContainsKey(fileName))
        {
            // Same file name seen before: reuse the cached density map.
            topicDensity = fileNameTopicDensityMap[fileName];
        }
        else
        {
            string fileContent = FileOperators.ReadFileText(fileEntity).Replace("\n", " ").ToLower();
            List<string> fileTermList = new List<string>(fileContent.Split(' '));
            topicDensity = CalWindowDensityInOneFile(fileTermList); //get the topic and the related density for one file
            fileNameTopicDensityMap.Add(fileName, topicDensity);
        }
        docTopicDensityMap.Add(fileEntity, topicDensity);
    }
}
/* Rank every document against every normalized topic: for each topic, build a
 * per-document tf-idf csv from the topic's key terms, compute the VSM cosine
 * similarity between each document vector and the topic's query vector, sort
 * documents by decreasing similarity, and append "topic\tfile\tsim" lines to
 * simStorePath. */
public void executeRank()
{
    getAllTopicTerms(); //get topic and the related terms, and do the normalization
    int txtDirLength = docsPath.Length;
    foreach (KeyValuePair<string, Dictionary<string, float>> entry in normalizedTopicTerms)
    {
        string topicName = entry.Key; //just the topic ID
        Dictionary<string, float> termAndValues = entry.Value;
        List<string> terms = new List<string>(termAndValues.Keys);
        List<float> queryVector = new List<float>(termAndValues.Values);
        // topicName = topicName.Replace(" ", string.Empty);
        tfidfStore = docsPath + "-ifidf\\" + topicName + ".csv";
        //for each document, generate the ifidf according to the keyterms of topic
        TFIDF tfidf = new TFIDF(terms, this.docsPath, tfidfStore);
        tfidf.calTfidf();
        string[] tfidfLines = FileOperators.ReadFileLines(tfidfStore);
        int lineScale = tfidfLines.Length;
        VSM vsm = new VSM();
        string simContent = "";
        Dictionary<string, double> docAndRelevance = new Dictionary<string, double>();
        for (int i = 1; i < lineScale; i++) // row 0 is the header line
        {
            string curLine = tfidfLines[i];
            int firstComma = curLine.IndexOf(';');
            string fileName = curLine.Substring(0, firstComma); //test if the length is right
            string valueStr = curLine.Substring(firstComma + 1);
            string[] valueTerms = valueStr.Split(';');
            List<float> docVector = new List<float>();
            foreach (string valueTerm in valueTerms)
            {
                docVector.Add(float.Parse(valueTerm));
            }
            double sim = vsm.calSimilarity(docVector, queryVector);
            if (sim > 0)
            {
                docAndRelevance.Add(fileName, sim); //get the similarity between doc and topic
            }
        }
        //execute decrease sorting on the docAndRelevance
        Dictionary<string, double> sortedByRelevance = DictionaryDecreasedSort.DecreasedByValue(docAndRelevance);
        foreach (string key in sortedByRelevance.Keys)
        {
            double similarity = sortedByRelevance[key];
            string fileName = key.Substring(txtDirLength);
            simContent += topicName + "\t" + fileName + "\t" + similarity + "\r\n";
        }
        // FIX: the trailing note below was split across two physical lines in
        // the original source, leaving "contain relativePath }" outside the
        // comment — a compile error. The note is now a single-line comment.
        FileOperators.FileAppend(simStorePath, simContent); //simStorePath should contain relativePath
    }
    Console.WriteLine("DONE!!");
}
/*
 * oriPDFPath: the pdf documents are stored in a directory
 */
/* For each component chunk of the ranking-result file, locate the matching
 * component directory under oriPDFPath (handling comma-separated candidate
 * component names, matched case-insensitively on the candidate side) and pass
 * the chunk's lines together with that component's known file list to
 * ReadResultFile. */
public void CountDocRelativeSeq(string resultFile, string oriPDFPath)
{
    //parse component and the related files
    Dictionary<string, List<string>> compFiles = new Dictionary<string, List<string>>();
    foreach (string compDir in Directory.GetDirectories(oriPDFPath))
    {
        string compName = FileNameParse.GetFileName(compDir);
        List<string> fileList = new List<string>();
        foreach (string filePath in Directory.GetFiles(compDir))
        {
            string fileName = FileNameParse.GetFileName(filePath);
            fileList.Add(fileName);
        }
        compFiles.Add(compName, fileList);
    }
    //read the result file
    string resultFileContent = FileOperators.ReadFileText(resultFile);
    string[] separators = new string[] { "\r\n\r\n" };
    string[] compChunks = resultFileContent.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    foreach (string compChunk in compChunks)
    {
        string[] lineSeparator = new string[] { "\r\n" };
        string[] lines = compChunk.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries);
        string firstLine = lines[0];
        if (!firstLine.Contains("\t"))
        {
            continue; // malformed chunk header
        }
        string[] lineTerms = firstLine.Split('\t');
        string compName = lineTerms[0];
        if (compName.Contains(","))
        {
            // FIX: removed an unused 'commaIndex' local that was computed here
            // but never read.
            string[] candidateComps = compName.Split(',');
            int candidateScale = candidateComps.Length;
            bool matched = false;
            int candidateIndex = 0;
            // Try each candidate name in order until one matches a directory.
            while (!matched && candidateIndex < candidateScale)
            {
                string candidate = candidateComps[candidateIndex].Trim();
                foreach (string fileComp in compFiles.Keys)
                {
                    if (candidate.Equals(fileComp.ToLower()))
                    {
                        List<string> targetFiles = compFiles[fileComp];
                        ReadResultFile(fileComp, lines, targetFiles);
                        matched = true;
                        break;
                    }
                }
                candidateIndex++;
            }
        }
        else
        {
            foreach (string fileComp in compFiles.Keys)
            {
                if (fileComp.Equals(compName))
                {
                    List<string> targetFiles = compFiles[fileComp];
                    ReadResultFile(fileComp, lines, targetFiles);
                    break;
                }
            }
        }
    }
}