/// <summary>
/// Opens an Excel workbook through COM interop, reads the displayed text of every cell
/// inside the used area of each worksheet and appends that text (cells separated by a
/// single space) to the destination file, one append per worksheet.
/// </summary>
/// <param name="sourceFileName">Path of the workbook to open.</param>
/// <param name="destFileName">Path handed to <c>FileOperators.FileAppend</c>.</param>
/// <remarks>
/// Worksheets on which <c>Find</c> locates no cell are skipped instead of crashing.
/// The workbook is always closed without saving and Excel is always asked to quit,
/// even when extraction fails part-way.
/// </remarks>
public static void ExecuteExtraction(string sourceFileName, string destFileName)
{
    Application _application = new Application();
    Workbook oriWorkBook = null;
    try
    {
        oriWorkBook = _application.Workbooks.Open(sourceFileName);
        int sheetNum = oriWorkBook.Worksheets.Count;

        // Excel collections are 1-based.
        for (int i = 1; i <= sheetNum; i++)
        {
            Worksheet xlWorkSheet = oriWorkBook.Worksheets[i];

            // Last used row - ignores cells whose formulas evaluate to blank values.
            Range lastRowCell = xlWorkSheet.Cells.Find(
                "*",
                System.Reflection.Missing.Value,
                Microsoft.Office.Interop.Excel.XlFindLookIn.xlValues,
                Microsoft.Office.Interop.Excel.XlLookAt.xlWhole,
                Microsoft.Office.Interop.Excel.XlSearchOrder.xlByRows,
                Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                false,
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value);

            // Last used column - LookIn/LookAt are left unspecified exactly as before,
            // so Excel reuses the settings of the previous Find call.
            Range lastColCell = xlWorkSheet.Cells.Find(
                "*",
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value,
                Microsoft.Office.Interop.Excel.XlSearchOrder.xlByColumns,
                Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                false,
                System.Reflection.Missing.Value,
                System.Reflection.Missing.Value);

            // Find returns null when nothing matches (e.g. an empty worksheet).
            if (lastRowCell == null || lastColCell == null)
            {
                continue;
            }

            int lastRowIgnoreFormulas = lastRowCell.Row;
            int lastColIgnoreFormulas = lastColCell.Column;

            // A builder avoids re-copying the whole sheet text for every cell.
            System.Text.StringBuilder fileContent = new System.Text.StringBuilder();
            for (int j = 1; j <= lastRowIgnoreFormulas; j++)
            {
                for (int k = 1; k <= lastColIgnoreFormulas; k++)
                {
                    Range usedRange = xlWorkSheet.Cells[j, k];
                    string rangeText = usedRange.Text;
                    fileContent.Append(rangeText).Append(' ');
                }
            }

            FileOperators.FileAppend(destFileName, fileContent.ToString());
        }
    }
    finally
    {
        try
        {
            if (oriWorkBook != null)
            {
                // Close takes a Boolean SaveChanges argument; pass false explicitly so
                // the source workbook is never written back.
                oriWorkBook.Close(false);
            }
        }
        finally
        {
            // Quit even if closing the workbook failed, so no Excel process is left behind.
            _application.Application.Quit();
        }
    }
}
/// <summary>
/// Serialises the component-to-terms map into the fixed text format
/// <c>component:name,1;name,1;term,0.2;term,0.2</c> (one lower-cased line per component,
/// lines joined with CRLF) and appends the result to <c>fixedFormatPath</c>.
/// </summary>
/// <param name="componentTerms">
/// Map from a component key (optionally several comma-separated component names) to the
/// domain terms of that component. Components with no terms are skipped.
/// </param>
/// <remarks>
/// Weights and lower-casing use the invariant culture: the output is comma/semicolon
/// delimited, so a culture-specific decimal comma ("0,2") would corrupt the format.
/// </remarks>
private void WriteDomainTerms(Dictionary<string, List<string>> componentTerms)
{
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    string componentWeight = 1.0.ToString(invariant); // "1"
    string termWeight = 0.2.ToString(invariant);      // "0.2"

    List<string> lines = new List<string>();

    // Write component level info.
    foreach (KeyValuePair<string, List<string>> entry in componentTerms)
    {
        string component = entry.Key;
        List<string> terms = entry.Value;
        if (terms == null || terms.Count == 0)
        {
            continue;
        }

        System.Text.StringBuilder line = new System.Text.StringBuilder();
        line.Append(component).Append(':');

        // Split yields the key itself when it contains no comma, so one loop covers
        // both the single-name and the multi-name case.
        foreach (string name in component.Split(','))
        {
            line.Append(name).Append(',').Append(componentWeight).Append(';');
        }

        foreach (string term in terms)
        {
            line.Append(term).Append(',').Append(termWeight).Append(';');
        }

        // At least one "name,weight;" entry was written, so the line ends with ';'.
        line.Length -= 1;
        lines.Add(line.ToString().ToLowerInvariant());
    }

    // No trailing line break after the last component; empty input appends "".
    FileOperators.FileAppend(fixedFormatPath, string.Join("\r\n", lines.ToArray()));
}
/// <summary>
/// Extracts the text of every page of a PDF document and appends it, page by page,
/// to the destination file.
/// </summary>
/// <param name="sourceFileName">Path of the PDF document to read.</param>
/// <param name="destFileName">Path handed to <c>FileOperators.FileAppend</c> for each page.</param>
/// <param name="backgroundWorker">Optional worker; not used by this method at present.</param>
public static void ExecuteExtraction(string sourceFileName, string destFileName, BackgroundWorker backgroundWorker = null)
{
    PdfReader reader = new PdfReader(sourceFileName);
    try
    {
        int pdfNum = reader.NumberOfPages;

        // PDF page numbers are 1-based.
        for (int i = 1; i <= pdfNum; i++)
        {
            // A fresh strategy instance per page; it receives the text and position info.
            var textPos = new MyLocationExtractionStrategy();
            string ex = PdfTextExtractor.GetTextFromPage(reader, i, textPos);
            FileOperators.FileAppend(destFileName, ex);
        }
    }
    finally
    {
        // Release the reader (and the underlying source file) even if extraction fails.
        reader.Close();
    }
}
/// <summary>
/// Parses the summary.txt file of the Stanford TMT results and appends one entry per
/// topic to the destination file in the form <c>topicId:term,relevance;term,relevance</c>,
/// where each relevance is the term's weight divided by the topic's total weight.
/// </summary>
/// <param name="sourceFilePath">Path of the summary file to read.</param>
/// <param name="destFilePath">Output path; an existing file is deleted first.</param>
/// <remarks>
/// Numbers are parsed and written with the invariant culture: the input is a
/// machine-generated file and the output uses ',' as a field separator, so a
/// culture-specific decimal comma would break both directions.
/// Whitespace-only lines inside a topic block are ignored.
/// </remarks>
public static void Execute(string sourceFilePath, string destFilePath)
{
    if (File.Exists(destFilePath))
    {
        File.Delete(destFilePath);
    }

    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    string topicContent = FileOperators.ReadFileText(sourceFilePath);
    string[] stringSeparators = new string[] { "\r\n\n\r\n" };
    string[] topicPartSeparators = new string[] { "\r\n" };
    string[] topicParts = topicContent.Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries);

    foreach (string topicPart in topicParts)
    {
        string[] lines = topicPart.Split(topicPartSeparators, StringSplitOptions.RemoveEmptyEntries);
        System.Text.StringBuilder singleTopicContent = new System.Text.StringBuilder();
        float topicSumWeight = 0f;

        foreach (string line in lines)
        {
            if (line.StartsWith("Topic", StringComparison.Ordinal))
            {
                // Header line: "<topic id>\t\t<total topic weight>".
                int spaceIndex = line.IndexOf("\t\t", StringComparison.Ordinal);
                string topicId = line.Substring(0, spaceIndex);
                string topicWeight = line.Substring(spaceIndex + 2);
                topicSumWeight = float.Parse(topicWeight, invariant);
                singleTopicContent.Append(topicId).Append(':');
            }
            else
            {
                string trimmedLine = line.Trim();
                if (trimmedLine.Length == 0)
                {
                    // Nothing to parse; previously this crashed on terms[1].
                    continue;
                }

                // Term line: "<term>\t<weight>".
                string[] terms = trimmedLine.Split('\t');
                string term = terms[0];
                string relevance = terms[1];
                float relevanceValue = float.Parse(relevance, invariant) / topicSumWeight;
                singleTopicContent.Append(term)
                                  .Append(',')
                                  .Append(relevanceValue.ToString(invariant))
                                  .Append(';');
            }
        }

        if (singleTopicContent.Length > 0)
        {
            // Drop the trailing separator character.
            singleTopicContent.Length -= 1;
            FileOperators.FileAppend(destFilePath, singleTopicContent.ToString());
        }
    }
}
/// <summary>
/// For every file, computes tf * idf for each tactic term and appends a
/// semicolon-separated record (file name first, then one value per term)
/// to <c>storePath</c>.
/// </summary>
/// <param name="fileEntries">The file names to process, one record each.</param>
private void updateTfidf(string[] fileEntries)
{
    foreach (string filePath in fileEntries)
    {
        // First column is the file name, followed by one tf-idf value per term.
        List<string> columns = new List<string>();
        columns.Add(filePath);

        foreach (string term in tacticTerms)
        {
            int termFrequency = tf(term, filePath);
            double inverseDocFrequency = terms_idf[term];
            double weight = termFrequency * inverseDocFrequency;
            columns.Add(weight.ToString());
        }

        string record = string.Join(";", columns.ToArray());
        FileOperators.FileAppend(this.storePath, record);
    }
}
/// <summary>
/// Re-keys the document-to-topic density map by topic, sorts each topic's documents
/// with <c>DictionaryDecreasedSort.DecreasedByValue</c> and appends tab-separated
/// rows (topic, file name, density) for every positive density to <c>rankResult</c>.
/// </summary>
private void WriteRankingResult()
{
    // Transfer the primary key from document id to topic id.
    Dictionary<string, Dictionary<string, float>> densityByTopic =
        new Dictionary<string, Dictionary<string, float>>();

    foreach (string doc in docTopicDensityMap.Keys)
    {
        Dictionary<string, float> densitiesOfDoc = docTopicDensityMap[doc];
        foreach (KeyValuePair<string, float> topicEntry in densitiesOfDoc)
        {
            Dictionary<string, float> densitiesOfTopic;
            if (!densityByTopic.TryGetValue(topicEntry.Key, out densitiesOfTopic))
            {
                densitiesOfTopic = new Dictionary<string, float>();
                densityByTopic.Add(topicEntry.Key, densitiesOfTopic);
            }

            densitiesOfTopic.Add(doc, topicEntry.Value);
        }
    }

    foreach (KeyValuePair<string, Dictionary<string, float>> topicEntry in densityByTopic)
    {
        string topicName = topicEntry.Key;
        Dictionary<string, float> ranked = DictionaryDecreasedSort.DecreasedByValue(topicEntry.Value);

        List<string> rows = new List<string>();
        foreach (KeyValuePair<string, float> docEntry in ranked)
        {
            // Strip the leading filePathLength characters from the document key.
            string fileName = docEntry.Key.Substring(filePathLength);
            float density = docEntry.Value;
            if (density > 0)
            {
                rows.Add(topicName + "\t" + fileName + "\t" + density + "\r\n");
            }
        }

        // One append per topic, even when no row qualified (empty string).
        FileOperators.FileAppend(rankResult, string.Concat(rows.ToArray()));
    }
}
/// <summary>
/// For every normalized topic: generates a tf-idf file for the topic's terms, computes
/// the similarity between each document vector and the topic's query vector, sorts the
/// documents with a positive similarity and appends tab-separated rows
/// (topic, file name, similarity) to <c>simStorePath</c>.
/// </summary>
public void executeRank()
{
    // Get each topic and its related terms, and do the normalization.
    getAllTopicTerms();
    int docsPrefixLength = docsPath.Length;

    foreach (KeyValuePair<string, Dictionary<string, float>> topic in normalizedTopicTerms)
    {
        string topicName = topic.Key; // just the topic ID
        Dictionary<string, float> weightedTerms = topic.Value;
        List<string> terms = new List<string>(weightedTerms.Keys);
        List<float> queryVector = new List<float>(weightedTerms.Values);

        // For each document, generate the tf-idf according to the key terms of the topic.
        tfidfStore = docsPath + "-ifidf\\" + topicName + ".csv";
        TFIDF tfidf = new TFIDF(terms, this.docsPath, tfidfStore);
        tfidf.calTfidf();

        string[] tfidfLines = FileOperators.ReadFileLines(tfidfStore);
        VSM vsm = new VSM();
        Dictionary<string, double> docAndRelevance = new Dictionary<string, double>();

        // Line 0 is skipped - presumably a header row; confirm against TFIDF.calTfidf.
        for (int lineIndex = 1; lineIndex < tfidfLines.Length; lineIndex++)
        {
            string record = tfidfLines[lineIndex];

            // Record layout: "<file name>;<value>;<value>;...".
            int firstSeparator = record.IndexOf(';');
            string fileName = record.Substring(0, firstSeparator);
            string[] rawValues = record.Substring(firstSeparator + 1).Split(';');

            List<float> docVector = new List<float>();
            foreach (string rawValue in rawValues)
            {
                docVector.Add(float.Parse(rawValue));
            }

            // Keep only documents that have some similarity to the topic.
            double sim = vsm.calSimilarity(docVector, queryVector);
            if (sim > 0)
            {
                docAndRelevance.Add(fileName, sim);
            }
        }

        // Execute decreasing sort on the document relevance values.
        Dictionary<string, double> sortedByRelevance = DictionaryDecreasedSort.DecreasedByValue(docAndRelevance);

        List<string> rows = new List<string>();
        foreach (KeyValuePair<string, double> ranked in sortedByRelevance)
        {
            // Drop the docsPath prefix from the stored file name.
            string fileName = ranked.Key.Substring(docsPrefixLength);
            rows.Add(topicName + "\t" + fileName + "\t" + ranked.Value + "\r\n");
        }

        // simStorePath should contain the relative path.
        FileOperators.FileAppend(simStorePath, string.Concat(rows.ToArray()));
    }

    Console.WriteLine("DONE!!");
}