Example #1
0
        /// <summary>
        /// Opens the Excel workbook at <paramref name="sourceFileName"/> and, for every worksheet,
        /// appends the displayed text of its cells to <paramref name="destFileName"/> through
        /// <c>FileOperators.FileAppend</c> (one call per worksheet).
        /// </summary>
        /// <remarks>
        /// Cells are visited row by row from (1, 1) up to the last row / last column located with
        /// <c>Range.Find("*")</c>; each cell text is followed by a single space. A worksheet on
        /// which <c>Find</c> locates nothing contributes an empty string. The workbook is closed
        /// without saving and Excel is quit even when extraction fails part-way.
        /// </remarks>
        /// <param name="sourceFileName">Path of the workbook to read.</param>
        /// <param name="destFileName">Path handed to <c>FileOperators.FileAppend</c>.</param>
        public static void ExecuteExtraction(string sourceFileName, string destFileName)
        {
            object missing = System.Reflection.Missing.Value;

            Application _application = new Application();
            try
            {
                Workbook oriWorkBook = _application.Workbooks.Open(sourceFileName);
                try
                {
                    int sheetNum = oriWorkBook.Worksheets.Count;

                    // Excel collections are 1-based.
                    for (int i = 1; i <= sheetNum; i++)
                    {
                        Worksheet xlWorkSheet = oriWorkBook.Worksheets[i];

                        // Last used row: search values backwards, row by row.
                        Range lastRowCell = xlWorkSheet.Cells.Find(
                            "*",
                            missing,
                            Microsoft.Office.Interop.Excel.XlFindLookIn.xlValues,
                            Microsoft.Office.Interop.Excel.XlLookAt.xlWhole,
                            Microsoft.Office.Interop.Excel.XlSearchOrder.xlByRows,
                            Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                            false,
                            missing,
                            missing);

                        // Last used column: search backwards, column by column.
                        Range lastColCell = xlWorkSheet.Cells.Find(
                            "*",
                            missing,
                            missing,
                            missing,
                            Microsoft.Office.Interop.Excel.XlSearchOrder.xlByColumns,
                            Microsoft.Office.Interop.Excel.XlSearchDirection.xlPrevious,
                            false,
                            missing,
                            missing);

                        // Find yields null when nothing matches (e.g. an empty sheet); treat that
                        // as "no cells" instead of dereferencing null.
                        int lastRowIgnoreFormulas = lastRowCell == null ? 0 : lastRowCell.Row;
                        int lastColIgnoreFormulas = lastColCell == null ? 0 : lastColCell.Column;

                        System.Text.StringBuilder fileContent = new System.Text.StringBuilder();

                        // Collect the displayed text of every cell inside the detected bounds.
                        for (int j = 1; j <= lastRowIgnoreFormulas; j++)
                        {
                            for (int k = 1; k <= lastColIgnoreFormulas; k++)
                            {
                                Range  usedRange = xlWorkSheet.Cells[j, k];
                                string rangeText = usedRange.Text;
                                fileContent.Append(rangeText).Append(" ");
                            }
                        }

                        FileOperators.FileAppend(destFileName, fileContent.ToString());
                    }
                }
                finally
                {
                    // Always release the workbook, discarding any changes.
                    oriWorkBook.Close(XlSaveAction.xlDoNotSaveChanges);
                }
            }
            finally
            {
                // Always shut Excel down so no orphaned process is left behind.
                _application.Application.Quit();
            }
        }
Example #2
0
        /// <summary>
        /// Serializes the terms of every component into one lower-cased line per component and
        /// appends the result to <c>fixedFormatPath</c> through <c>FileOperators.FileAppend</c>.
        /// </summary>
        /// <remarks>
        /// Line layout: <c>component:name,1;...;term,0.2;...</c> without a trailing semicolon.
        /// A component key containing commas is split and every part becomes its own
        /// <c>name,1</c> entry. Components with an empty term list are skipped. Lines are joined
        /// with CRLF and the output has no trailing line break. Weights are written with the
        /// invariant culture so the decimal separator never collides with the ',' field delimiter.
        /// </remarks>
        /// <param name="componentTerms">Maps a component key to the list of its domain terms.</param>
        private void WriteDomainTerms(Dictionary <string, List <string> > componentTerms)
        {
            const double componentWeight = 1.0;
            const double termWeight      = 0.2;

            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            string componentEntryTail = "," + componentWeight.ToString(invariant) + ";";
            string termEntryTail      = "," + termWeight.ToString(invariant) + ";";

            System.Text.StringBuilder domainTermContent = new System.Text.StringBuilder();

            //write component level info
            foreach (KeyValuePair <string, List <string> > entry in componentTerms)
            {
                string        component = entry.Key;
                List <string> terms     = entry.Value;
                if (terms.Count == 0)
                {
                    continue;
                }

                System.Text.StringBuilder line = new System.Text.StringBuilder();
                line.Append(component).Append(":");

                // Split returns the key itself when it holds no comma, so a single loop covers
                // both the multi-name and the single-name case.
                foreach (string name in component.Split(','))
                {
                    line.Append(name).Append(componentEntryTail);
                }

                foreach (string term in terms)
                {
                    line.Append(term).Append(termEntryTail);
                }

                // Every entry ends with ';', so the line always carries one trailing ';' to drop.
                line.Length -= 1;

                // Separate lines with CRLF; no line break after the last one.
                if (domainTermContent.Length > 0)
                {
                    domainTermContent.Append("\r\n");
                }
                domainTermContent.Append(line.ToString().ToLower());
            }

            FileOperators.FileAppend(fixedFormatPath, domainTermContent.ToString());
        }
Example #3
0
        /// <summary>
        /// Extracts the text of every page of the PDF at <paramref name="sourceFileName"/> and
        /// appends it, page by page, to <paramref name="destFileName"/> through
        /// <c>FileOperators.FileAppend</c>.
        /// </summary>
        /// <param name="sourceFileName">Path of the PDF document to read.</param>
        /// <param name="destFileName">Path handed to <c>FileOperators.FileAppend</c>.</param>
        /// <param name="backgroundWorker">Optional worker; not used by this method.</param>
        public static void ExecuteExtraction(string sourceFileName, string destFileName, BackgroundWorker backgroundWorker = null)
        {
            PdfReader reader = new PdfReader(sourceFileName);
            try
            {
                int pdfNum = reader.NumberOfPages;

                // PDF page numbers are 1-based.
                for (int i = 1; i <= pdfNum; i++)
                {
                    // A fresh strategy instance per page.
                    var textPos = new MyLocationExtractionStrategy();

                    // Extract the page text using that strategy.
                    string ex = PdfTextExtractor.GetTextFromPage(reader, i, textPos);
                    FileOperators.FileAppend(destFileName, ex);
                }
            }
            finally
            {
                // Release the underlying file even when extraction of a page fails.
                reader.Close();
            }
        }
Example #4
0
        /// <summary>
        /// Parses the summary.txt file of the Stanford TMT results and appends one normalized
        /// line per topic to <paramref name="destFilePath"/>.
        /// </summary>
        /// <remarks>
        /// An existing destination file is deleted first. The input is split into topic parts on
        /// the separator "\r\n\n\r\n". Inside a part, a line starting with "Topic" is the header
        /// (<c>id</c>, two tabs, <c>weight</c>); every other line is trimmed and read as
        /// <c>term</c>, tab, <c>relevance</c>. Each relevance is divided by the weight of the most
        /// recent header. Output line layout: <c>id:term,value;term,value</c>.
        /// Numbers are parsed and written with the invariant culture, so the result does not
        /// depend on the machine's regional settings.
        /// A header without the double-tab separator, or a term line with fewer than two
        /// tab-separated fields, raises an exception.
        /// </remarks>
        /// <param name="sourceFilePath">Path of the summary file to parse.</param>
        /// <param name="destFilePath">Path of the file that receives the parsed topics.</param>
        public static void Execute(string sourceFilePath, string destFilePath)
        {
            if (File.Exists(destFilePath))
            {
                File.Delete(destFilePath);
            }

            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            string topicContent = FileOperators.ReadFileText(sourceFilePath);

            string[] stringSeparators    = new string[] { "\r\n\n\r\n" };
            string[] topicPartSeparators = new string[] { "\r\n" };

            string[] topicParts = topicContent.Split(stringSeparators, StringSplitOptions.RemoveEmptyEntries);
            foreach (string topicPart in topicParts)
            {
                string[] lines = topicPart.Split(topicPartSeparators, StringSplitOptions.RemoveEmptyEntries);

                System.Text.StringBuilder singleTopicContent = new System.Text.StringBuilder();
                float topicSumWeight = 0f;

                foreach (string line in lines)
                {
                    if (line.StartsWith("Topic", StringComparison.Ordinal))
                    {
                        // Header: "<topic id>\t\t<total weight>".
                        int    spaceIndex  = line.IndexOf("\t\t", StringComparison.Ordinal);
                        string topicId     = line.Substring(0, spaceIndex);
                        string topicWeight = line.Substring(spaceIndex + 2);
                        topicSumWeight = float.Parse(topicWeight, invariant);
                        singleTopicContent.Append(topicId).Append(":");
                    }
                    else
                    {
                        // Term line: "<term>\t<relevance>".
                        string[] terms          = line.Trim().Split('\t');
                        string   term           = terms[0];
                        float    relevanceValue = float.Parse(terms[1], invariant);

                        // Normalize by the weight of the enclosing topic.
                        relevanceValue = relevanceValue / topicSumWeight;
                        singleTopicContent.Append(term)
                                          .Append(",")
                                          .Append(relevanceValue.ToString(invariant))
                                          .Append(";");
                    }
                }

                if (singleTopicContent.Length > 0)
                {
                    // Drop the last character (the trailing entry separator).
                    singleTopicContent.Length -= 1;
                    FileOperators.FileAppend(destFilePath, singleTopicContent.ToString());
                }
            }
        }
Example #5
0
        /// <summary>
        /// Writes one tf-idf row per file: the file name followed by the tf-idf weight of every
        /// entry of <c>tacticTerms</c>, separated by ';'. Each row is appended to
        /// <c>this.storePath</c> through <c>FileOperators.FileAppend</c>.
        /// </summary>
        /// <param name="fileEntries">Files for which a row is produced.</param>
        private void updateTfidf(string[] fileEntries)
        {
            for (int fileIndex = 0; fileIndex < fileEntries.Length; fileIndex++)
            {
                string currentFile = fileEntries[fileIndex];

                System.Text.StringBuilder row = new System.Text.StringBuilder();
                row.Append(currentFile).Append(";");

                foreach (string term in tacticTerms)
                {
                    int    termFrequency        = tf(term, currentFile);
                    double inverseDocFrequency  = terms_idf[term];
                    double weight               = termFrequency * inverseDocFrequency;
                    row.Append(weight.ToString()).Append(";");
                }

                // The row always ends with ';' at this point; strip that last separator.
                row.Length -= 1;

                FileOperators.FileAppend(this.storePath, row.ToString());
            }
        }
Example #6
0
        /// <summary>
        /// Re-keys <c>docTopicDensityMap</c> from document to topic, sorts the documents of each
        /// topic with <c>DictionaryDecreasedSort.DecreasedByValue</c> and appends the entries with
        /// a density greater than zero to <c>rankResult</c>, one
        /// <c>topic TAB file TAB density</c> line per document (one append call per topic).
        /// </summary>
        private void WriteRankingResult()
        {
            Dictionary<string, Dictionary<string, float>> densityByTopic =
                new Dictionary<string, Dictionary<string, float>>();

            // Invert the mapping: document -> (topic -> density) becomes topic -> (document -> density).
            foreach (string doc in docTopicDensityMap.Keys)
            {
                Dictionary<string, float> densityOfDoc = docTopicDensityMap[doc];
                foreach (KeyValuePair<string, float> topicEntry in densityOfDoc)
                {
                    Dictionary<string, float> docsOfTopic;
                    if (!densityByTopic.TryGetValue(topicEntry.Key, out docsOfTopic))
                    {
                        docsOfTopic = new Dictionary<string, float>();
                        densityByTopic.Add(topicEntry.Key, docsOfTopic);
                    }
                    docsOfTopic.Add(doc, topicEntry.Value);
                }
            }

            foreach (KeyValuePair<string, Dictionary<string, float>> topicEntry in densityByTopic)
            {
                string topicName = topicEntry.Key;
                Dictionary<string, float> ranked = DictionaryDecreasedSort.DecreasedByValue(topicEntry.Value);

                System.Text.StringBuilder report = new System.Text.StringBuilder();
                foreach (KeyValuePair<string, float> docEntry in ranked)
                {
                    // Strip the leading filePathLength characters from the stored key.
                    string fileName = docEntry.Key.Substring(filePathLength);
                    float  density  = docEntry.Value;

                    // Only strictly positive densities are reported.
                    if (!(density > 0))
                    {
                        continue;
                    }

                    report.Append(topicName)
                          .Append("\t")
                          .Append(fileName)
                          .Append("\t")
                          .Append(density.ToString())
                          .Append("\r\n");
                }

                FileOperators.FileAppend(rankResult, report.ToString());
            }
        }
Example #7
0
        /// <summary>
        /// Ranks documents against every topic in <c>normalizedTopicTerms</c>.
        /// </summary>
        /// <remarks>
        /// For each topic: a <c>TFIDF</c> instance writes the topic's tf-idf table to
        /// <c>docsPath + "-ifidf\" + topic + ".csv"</c>; the table is read back, skipping its first
        /// line; every remaining row (<c>file;v1;v2;...</c>) is compared with the topic's weight
        /// vector through <c>VSM.calSimilarity</c>; rows with a similarity greater than zero are
        /// sorted with <c>DictionaryDecreasedSort.DecreasedByValue</c> and appended to
        /// <c>simStorePath</c> as <c>topic TAB file TAB similarity</c> lines. Prints "DONE!!" to
        /// the console when all topics are processed.
        /// </remarks>
        public void executeRank()
        {
            // Load the topics with their related terms (normalized weights).
            getAllTopicTerms();

            int docsPathLength = docsPath.Length;

            foreach (KeyValuePair<string, Dictionary<string, float>> topic in normalizedTopicTerms)
            {
                string topicId = topic.Key; // just the topic ID
                Dictionary<string, float> weightedTerms = topic.Value;

                List<string> topicTerms  = new List<string>(weightedTerms.Keys);
                List<float>  topicVector = new List<float>(weightedTerms.Values);

                // Build the tf-idf table of all documents for the key terms of this topic.
                tfidfStore = docsPath + "-ifidf\\" + topicId + ".csv";
                TFIDF tfidf = new TFIDF(topicTerms, this.docsPath, tfidfStore);
                tfidf.calTfidf();

                string[] rows     = FileOperators.ReadFileLines(tfidfStore);
                int      rowCount = rows.Length;

                VSM vsm = new VSM();

                Dictionary<string, double> relevanceByDoc = new Dictionary<string, double>();

                // Row 0 is skipped; every other row is "<file>;<v1>;<v2>;...".
                for (int rowIndex = 1; rowIndex < rowCount; rowIndex++)
                {
                    string row       = rows[rowIndex];
                    int    separator = row.IndexOf(';');
                    string docPath   = row.Substring(0, separator);

                    string[] cells = row.Substring(separator + 1).Split(';');

                    List<float> docVector = new List<float>();
                    for (int cellIndex = 0; cellIndex < cells.Length; cellIndex++)
                    {
                        docVector.Add(float.Parse(cells[cellIndex]));
                    }

                    // Similarity between the document and the topic.
                    double similarity = vsm.calSimilarity(docVector, topicVector);
                    if (similarity > 0)
                    {
                        relevanceByDoc.Add(docPath, similarity);
                    }
                }

                // Most relevant documents first.
                Dictionary<string, double> ranked = DictionaryDecreasedSort.DecreasedByValue(relevanceByDoc);

                System.Text.StringBuilder report = new System.Text.StringBuilder();
                foreach (KeyValuePair<string, double> hit in ranked)
                {
                    // Strip the leading docsPath.Length characters from the stored key.
                    string fileName = hit.Key.Substring(docsPathLength);
                    report.Append(topicId)
                          .Append("\t")
                          .Append(fileName)
                          .Append("\t")
                          .Append(hit.Value.ToString())
                          .Append("\r\n");
                }

                FileOperators.FileAppend(simStorePath, report.ToString()); //simStorePath should contain relativePath
            }

            Console.WriteLine("DONE!!");
        }