Example #1
0
 /// <summary>
 /// Parses component terms and their weights from a terms file (the original
 /// note names TopicManualTerms.txt) and stores them in <c>compTerms</c>,
 /// keyed by component name.
 /// </summary>
 /// <remarks>
 /// A useful line has the form <c>name:term,weight;term,weight;...</c>.
 /// Lines without a ':' and entries without a ',' are ignored. When a term is
 /// repeated within one line, the first weight is kept.
 /// </remarks>
 /// <param name="searchTerms">Path of the file to read.</param>
 /// <exception cref="System.FormatException">A weight is not a valid number.</exception>
 private void ParseCompTerms(string searchTerms)
 {
     string[] compLines = FileOperators.ReadFileLines(searchTerms);
     foreach (string aLine in compLines)
     {
         int colonIndex = aLine.IndexOf(':');
         if (colonIndex < 0)
         {
             continue;
         }

         string   compName    = aLine.Substring(0, colonIndex);
         string[] termWeights = aLine.Substring(colonIndex + 1).Split(';');
         Dictionary<string, float> termWeightDic = new Dictionary<string, float>();
         foreach (string termWeight in termWeights)
         {
             string[] tmpTerms = termWeight.Split(',');
             if (tmpTerms.Length < 2)
             {
                 // No ',' in this entry (e.g. the empty piece after a trailing ';').
                 continue;
             }

             string term = tmpTerms[0];
             // ',' is the field separator of this format, so the decimal mark
             // must be '.'; parse independently of the machine's culture.
             float weight = float.Parse(tmpTerms[1], System.Globalization.CultureInfo.InvariantCulture);

             // Dictionary.Add throws on a repeated key; keep the first weight instead.
             if (!termWeightDic.ContainsKey(term))
             {
                 termWeightDic.Add(term, weight);
             }
         }
         compTerms.Add(compName, termWeightDic);
     }
 }
Example #2
0
        /// <summary>
        /// Collects the terms listed for a topic in a topic-term file.
        /// </summary>
        /// <remarks>
        /// Every line that starts with <paramref name="targetTopicName"/> contributes.
        /// The text after the first ':' is read as <c>term,value;term,value;...</c>;
        /// each term is lower-cased and trimmed. Pairs without a ',' are skipped.
        /// NOTE(review): the prefix match also accepts longer topic names that begin
        /// with the target (e.g. "topic1" matches "topic10:...") - confirm this is intended.
        /// </remarks>
        /// <param name="topicTermPath">Path of the topic-term file.</param>
        /// <param name="targetTopicName">Line prefix that selects the topic.</param>
        /// <returns>The terms in file order; duplicates are not removed.</returns>
        public static List <string> ParseTopicTerms(string topicTermPath, string targetTopicName)
        {
            List <string> topicTerms = new List <string>();

            string[] topicTermLines = FileOperators.ReadFileLines(topicTermPath);
            foreach (string line in topicTermLines)
            {
                if (!line.StartsWith(targetTopicName))
                {
                    continue;
                }

                int    colonIndex = line.IndexOf(':');
                string termValues = line.Substring(colonIndex + 1);

                // Split returns the whole string when there is no ';', so a single
                // pair and a list of pairs share one code path.
                foreach (string termValuePair in termValues.Split(';'))
                {
                    int commaIndex = termValuePair.IndexOf(',');
                    if (commaIndex < 0)
                    {
                        // Malformed or empty pair (e.g. after a trailing ';'):
                        // Substring(0, -1) would throw, so skip it.
                        continue;
                    }

                    string term = termValuePair.Substring(0, commaIndex).ToLower().Trim();
                    topicTerms.Add(term);
                }
            }
            return(topicTerms);
        }
Example #3
0
        /// <summary>
        /// Reads the n-gram file and keeps the bigrams whose two words are both
        /// longer than two characters.
        /// </summary>
        /// <remarks>
        /// Only the text before the first space of a line is examined. It is split
        /// on "&lt;&gt;" and must yield exactly three parts (first word, second word,
        /// frequency); otherwise "check!" is printed. The frequency is parsed but
        /// not used for filtering yet.
        /// </remarks>
        /// <returns>The kept bigrams, each formatted as <c>first&lt;&gt;second</c>.</returns>
        private List <string> FilterNGram()
        {
            string[]      delimiter = { "<>" };
            List <string> kept      = new List <string>();

            foreach (string entry in FileOperators.ReadFileLines(n_gramFile))
            {
                int firstSpace = entry.IndexOf(" ");
                if (firstSpace < 0)
                {
                    continue;
                }

                string[] parts = entry.Substring(0, firstSpace).Split(delimiter, StringSplitOptions.RemoveEmptyEntries);
                if (parts.Length != 3)
                {
                    Console.WriteLine("check!");
                    continue;
                }

                if (parts[0].Length > 2 && parts[1].Length > 2)
                {
                    // The value is not used, but a non-numeric frequency still fails here.
                    float.Parse(parts[2]);
                    kept.Add(parts[0] + "<>" + parts[1]);
                }
            }
            return(kept);
        }
Example #4
0
 /// <summary>
 /// Stems every file under a directory tree and writes the results under
 /// <paramref name="stemmedPath"/>, keeping the original hierarchy.
 /// </summary>
 /// <remarks>
 /// Does nothing when <c>IsDirectory(sourceFile)</c> is false. Each line is
 /// split on spaces and commas; the stemmed terms are written back separated
 /// by single spaces (with a trailing space) and lines end with "\r\n".
 /// </remarks>
 /// <param name="sourceFile">Directory whose files and sub-directories are stemmed.</param>
 /// <param name="stemmedPath">Directory that receives the stemmed copies.</param>
 public void FileDirStemming(string sourceFile, string stemmedPath)
 {
     if (!IsDirectory(sourceFile))
     {
         return;
     }

     // Recurse first so sub-directories are mirrored under stemmedPath.
     foreach (string subDir in Directory.GetDirectories(sourceFile))
     {
         string subDirName = FileNameParse.GetFileName(subDir);
         FileDirStemming(subDir, stemmedPath + "\\" + subDirName);
     }

     string[] separators = { " ", "," };
     foreach (string subFile in Directory.GetFiles(sourceFile))
     {
         string[] fileLines = FileOperators.ReadFileLines(subFile);
         string   fileName  = FileNameParse.GetFileName(subFile);

         // A builder avoids re-copying the whole content for every appended term.
         System.Text.StringBuilder stemmedContent = new System.Text.StringBuilder();
         foreach (string fileLine in fileLines)
         {
             string[] terms  = fileLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
             Porter2  porter = new Porter2();
             foreach (string term in terms)
             {
                 string stemmedTerm = porter.stem(term);
                 stemmedContent.Append(stemmedTerm).Append(" ");
             }
             stemmedContent.Append("\r\n");
         }
         FileOperators.FileWrite(stemmedPath + "\\" + fileName, stemmedContent.ToString());
     }
 }
Example #5
0
        /// <summary>
        /// Parses a topic-term file into a map of topic name to (term to value).
        /// </summary>
        /// <remarks>
        /// A line has the form <c>topic:term,value;term,value;...</c>. Lines without
        /// a ':' and pairs without a ',' are skipped. When a topic or a term is
        /// repeated, the first occurrence is kept. A line holding a single pair
        /// (no ';') is parsed like any other.
        /// </remarks>
        /// <param name="topicTermsFileName">Path of the topic-term file.</param>
        /// <returns>Topic name mapped to its terms and their numeric values.</returns>
        /// <exception cref="System.FormatException">A value is not a valid number.</exception>
        private static Dictionary <string, Dictionary <string, double> > ParseTopicTerms(string topicTermsFileName)
        {
            Dictionary <string, Dictionary <string, double> > topicTermPropMap = new Dictionary <string, Dictionary <string, double> >();

            string[] topicTermLines = FileOperators.ReadFileLines(topicTermsFileName);
            foreach (string line in topicTermLines)
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    // Blank or malformed line: Substring(0, -1) would throw.
                    continue;
                }

                string topicName = line.Substring(0, colonIndex);
                if (topicTermPropMap.ContainsKey(topicName))
                {
                    // Only the first definition of a topic is kept.
                    continue;
                }

                Dictionary <string, double> termProps = new Dictionary <string, double>();

                // Always split: a value part without ';' is a single pair, not "no pairs".
                foreach (string termValuePair in line.Substring(colonIndex + 1).Split(';'))
                {
                    int commaIndex = termValuePair.IndexOf(',');
                    if (commaIndex < 0)
                    {
                        continue;
                    }

                    string term = termValuePair.Substring(0, commaIndex);
                    if (termProps.ContainsKey(term))
                    {
                        continue;
                    }

                    string propStr = termValuePair.Substring(commaIndex + 1);
                    // ',' separates fields here, so the decimal mark must be '.'.
                    termProps.Add(term, double.Parse(propStr, System.Globalization.CultureInfo.InvariantCulture));
                }

                topicTermPropMap.Add(topicName, termProps);
            }

            return(topicTermPropMap);
        }
Example #6
0
        /// <summary>
        /// Parses a topic-term file into a map of topic id to its list of terms.
        /// </summary>
        /// <remarks>
        /// A line has the form <c>topic:term,value;term,value;...</c>; only the term
        /// part of each pair is kept. Lines without a ':' and pairs without a ','
        /// are skipped. When a topic id is repeated, the first line is kept. A line
        /// holding a single pair (no ';') is parsed like any other.
        /// </remarks>
        /// <param name="topicTermFilePath">Path of the topic-term file.</param>
        /// <returns>Topic id mapped to its terms in file order.</returns>
        private static Dictionary <string, List <string> > ParseTopicTerms(string topicTermFilePath)
        {
            Dictionary <string, List <string> > topicTerms = new Dictionary <string, List <string> >();

            string[] topicTermLines = FileOperators.ReadFileLines(topicTermFilePath);
            foreach (string line in topicTermLines)
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    // Blank or malformed line: Substring(0, -1) would throw.
                    continue;
                }

                string topicID = line.Substring(0, colonIndex);
                if (topicTerms.ContainsKey(topicID))
                {
                    // Dictionary.Add throws on a repeated key; keep the first definition.
                    continue;
                }

                List <string> terms = new List <string>();

                // Always split: a value part without ';' is a single pair, not "no pairs".
                foreach (string termValuePair in line.Substring(colonIndex + 1).Split(';'))
                {
                    int commaIndex = termValuePair.IndexOf(',');
                    if (commaIndex < 0)
                    {
                        continue;
                    }
                    terms.Add(termValuePair.Substring(0, commaIndex));
                }

                topicTerms.Add(topicID, terms);
            }

            return(topicTerms);
        }
Example #7
0
 /// <summary>
 /// Appends the manually supplied search terms of the current target topic
 /// to <paramref name="topicTerms"/>.
 /// </summary>
 /// <remarks>
 /// The file path comes from <c>Configures.GetManualSearchTermPath()</c>; if the
 /// file does not exist nothing happens. A relevant line has the form
 /// <c>name:term,term,...</c> where <c>name</c> equals <c>targetTopicName</c>.
 /// Terms are trimmed; empty terms and terms already in the list are not added.
 /// </remarks>
 /// <param name="topicTerms">List that receives the extra terms.</param>
 private void AddUserSearchTerms(List<string> topicTerms)
 {
     string extraSearchTermPath = Configures.GetManualSearchTermPath();
     if (!File.Exists(extraSearchTermPath))
     {
         return;
     }

     foreach (string line in FileOperators.ReadFileLines(extraSearchTermPath))
     {
         int colonIndex = line.IndexOf(':');
         if (colonIndex < 0)
         {
             continue;
         }

         string compName = line.Substring(0, colonIndex);
         if (!compName.Equals(targetTopicName))
         {
             continue;
         }

         foreach (string extraTerm in line.Substring(colonIndex + 1).Split(','))
         {
             string trimmedTerm = extraTerm.Trim();
             if (trimmedTerm.Length == 0)
             {
                 // A trailing ',' or an empty list would otherwise add "" as a term.
                 continue;
             }

             if (!topicTerms.Contains(trimmedTerm))
             {
                 topicTerms.Add(trimmedTerm);
             }
         }
     }
 }
Example #8
0
        /// <summary>
        /// Parses a topic-term file into a map of topic id to (term to value).
        /// </summary>
        /// <remarks>
        /// A line has the form <c>topic:term,value;term,value;...</c>. Terms are
        /// trimmed and lower-cased; when a term repeats within a line the first
        /// value is kept. Lines without a ':' (including empty lines) and pairs
        /// without a ',' are skipped.
        /// </remarks>
        /// <param name="topicTermsPath">Path of the topic-term file.</param>
        /// <returns>Topic id mapped to its terms and their values.</returns>
        /// <exception cref="System.FormatException">A value is not a valid number.</exception>
        /// <exception cref="System.ArgumentException">The same topic id appears on two lines.</exception>
        public static Dictionary <string, Dictionary <string, float> > ParseTopicTerms(string topicTermsPath)
        {
            Dictionary <string, Dictionary <string, float> > topicTerms = new Dictionary <string, Dictionary <string, float> >();

            string[] topicLines = FileOperators.ReadFileLines(topicTermsPath);
            foreach (string line in topicLines)
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    // Covers empty lines as well as lines with no topic separator.
                    continue;
                }

                string topicID = line.Substring(0, colonIndex);
                Dictionary <string, float> termValueMap = new Dictionary <string, float>();

                foreach (string termValue in line.Substring(colonIndex + 1).Split(';'))
                {
                    string[] termAndValue = termValue.Split(',');
                    if (termAndValue.Length < 2)
                    {
                        // No ',' (e.g. the empty piece after a trailing ';'):
                        // there is no value to index.
                        continue;
                    }

                    string term = termAndValue[0].Trim().ToLower();
                    if (!termValueMap.ContainsKey(term))
                    {
                        // ',' separates fields here, so the decimal mark must be '.'.
                        termValueMap.Add(term, float.Parse(termAndValue[1], System.Globalization.CultureInfo.InvariantCulture));
                    }
                }
                topicTerms.Add(topicID, termValueMap);
            }
            return(topicTerms);
        }
Example #9
0
        /// <summary>
        /// Builds a map from topic id to topic name out of the file at
        /// <c>topicNamePath</c>.
        /// </summary>
        /// <remarks>
        /// A line has the form <c>id:name</c>; everything after the first ':' is
        /// the name, so names may themselves contain ':'. Lines without a ':' are
        /// skipped, and when an id repeats the first name is kept. A null or empty
        /// path yields an empty map.
        /// </remarks>
        /// <returns>Topic id mapped to topic name; never null.</returns>
        private Dictionary <string, string> mapTopicIDName()
        {
            Dictionary <string, string> IDNameMap = new Dictionary <string, string>();

            if (string.IsNullOrEmpty(this.topicNamePath))
            {
                return(IDNameMap);
            }

            foreach (string line in FileOperators.ReadFileLines(this.topicNamePath))
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    // Blank or malformed line: there is no name part to read.
                    continue;
                }

                string topicID   = line.Substring(0, colonIndex);
                string topicName = line.Substring(colonIndex + 1);
                if (!IDNameMap.ContainsKey(topicID))
                {
                    IDNameMap.Add(topicID, topicName);
                }
            }
            return(IDNameMap);
        }
Example #10
0
        /// <summary>
        /// Reads the topic-term file at <c>topicTermsPath</c>, replaces topic ids by
        /// names where <c>mapTopicIDName()</c> knows one, and stores the normalized
        /// result in <c>normalizedTopicTerms</c>.
        /// </summary>
        /// <remarks>
        /// A line has the form <c>topic:term,value;term,value;...</c>. Terms are
        /// trimmed and lower-cased; when a term repeats within a line the first
        /// value is kept. Values stay as text and are handed to
        /// <c>NormalizeTopicRelevance.DoNormalize</c>. Lines without a ':'
        /// (including empty lines) and pairs without a ',' are skipped.
        /// </remarks>
        /// <exception cref="System.ArgumentException">Two lines resolve to the same topic name.</exception>
        private void getAllTopicTerms()
        {
            Dictionary <string, Dictionary <string, string> > topicTerms = new Dictionary <string, Dictionary <string, string> >();

            string[] topicLines = FileOperators.ReadFileLines(topicTermsPath);
            Dictionary <string, string> topicIDNameMap = mapTopicIDName();

            foreach (string line in topicLines)
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    // Covers empty lines as well as lines with no topic separator.
                    continue;
                }

                string topicID = line.Substring(0, colonIndex);

                // Fall back to the id itself when no display name is known.
                string topicName;
                if (!topicIDNameMap.TryGetValue(topicID, out topicName))
                {
                    topicName = topicID;
                }

                Dictionary <string, string> termValueMap = new Dictionary <string, string>();
                foreach (string termValue in line.Substring(colonIndex + 1).Split(';'))
                {
                    string[] termAndValue = termValue.Split(',');
                    if (termAndValue.Length < 2)
                    {
                        // No ',' (e.g. the empty piece after a trailing ';'):
                        // there is no value to index.
                        continue;
                    }

                    string term = termAndValue[0].Trim().ToLower();
                    if (!termValueMap.ContainsKey(term))
                    {
                        termValueMap.Add(term, termAndValue[1]);
                    }
                }
                topicTerms.Add(topicName, termValueMap);
            }

            NormalizeTopicRelevance normalizer = new NormalizeTopicRelevance();

            normalizedTopicTerms = normalizer.DoNormalize(topicTerms);
        }
Example #11
0
        /// <summary>
        /// Runs the cleaning pipeline on one file and writes the result to
        /// <paramref name="destFileName"/>.
        /// </summary>
        /// <remarks>
        /// The lines pass through <c>DeleteDecorate</c>, <c>DetailClean</c>,
        /// <c>CleanMoreSpace</c> and <c>MergeContent</c> in that order and are then
        /// joined with "\r\n".
        /// </remarks>
        /// <param name="sourceFilePath">File to clean.</param>
        /// <param name="destFileName">File that receives the cleaned text.</param>
        private void CleanSingleFile(string sourceFilePath, string destFileName)
        {
            string[] oriLines = FileOperators.ReadFileLines(sourceFilePath);

            // Per the original note: contents, list of figures, list of tables, appendix.
            List <string> filterDecorate = DeleteDecorate(oriLines);
            List <string> cleanedText    = DetailClean(filterDecorate);

            // Per the original note: filters consecutive empty space in each line.
            List <string> furtherClean  = CleanMoreSpace(cleanedText);
            List <string> mergedContent = MergeContent(furtherClean);

            FileOperators.FileWrite(destFileName, String.Join("\r\n", mergedContent));
        }
Example #12
0
        /// <summary>
        /// Ranks documents against every topic and appends the ranking to
        /// <c>simStorePath</c>.
        /// </summary>
        /// <remarks>
        /// For each topic in <c>normalizedTopicTerms</c> a tf-idf file is generated
        /// (via <c>TFIDF.calTfidf</c>) and read back. Each data line is taken as
        /// <c>fileName;v1;v2;...</c>; its vector is compared with the topic's query
        /// vector by <c>VSM.calSimilarity</c>. Documents with a similarity above 0
        /// are sorted by <c>DictionaryDecreasedSort.DecreasedByValue</c> and written
        /// as <c>topic TAB file TAB similarity</c> lines. Lines without a ';'
        /// (e.g. blank lines) are skipped.
        /// </remarks>
        public void executeRank()
        {
            getAllTopicTerms(); //get topic and the related terms, and do the normalization

            int txtDirLength = docsPath.Length;

            foreach (KeyValuePair <string, Dictionary <string, float> > entry in normalizedTopicTerms)
            {
                string topicName = entry.Key; //just the topic ID

                Dictionary <string, float> termAndValues = entry.Value;
                List <string> terms       = new List <string>(termAndValues.Keys);
                List <float>  queryVector = new List <float>(termAndValues.Values);

                tfidfStore = docsPath + "-ifidf\\" + topicName + ".csv";
                //for each document, generate the tf-idf according to the key terms of the topic
                TFIDF tfidf = new TFIDF(terms, this.docsPath, tfidfStore);
                tfidf.calTfidf();

                string[] tfidfLines = FileOperators.ReadFileLines(tfidfStore);

                VSM vsm = new VSM();
                Dictionary <string, double> docAndRelevance = new Dictionary <string, double>();

                // Index 0 is never read - presumably a header row; confirm against TFIDF's output.
                for (int i = 1; i < tfidfLines.Length; i++)
                {
                    string curLine        = tfidfLines[i];
                    int    firstSeparator = curLine.IndexOf(';');
                    if (firstSeparator < 0)
                    {
                        // Blank or malformed line: Substring(0, -1) would throw.
                        continue;
                    }

                    string   fileName   = curLine.Substring(0, firstSeparator);
                    string[] valueTerms = curLine.Substring(firstSeparator + 1).Split(';');

                    List <float> docVector = new List <float>(valueTerms.Length);
                    foreach (string valueTerm in valueTerms)
                    {
                        // NOTE(review): parsed with the current culture, which must match
                        // however TFIDF wrote the numbers - confirm.
                        docVector.Add(float.Parse(valueTerm));
                    }

                    double sim = vsm.calSimilarity(docVector, queryVector);
                    if (sim > 0)
                    {
                        docAndRelevance.Add(fileName, sim); //similarity between doc and topic
                    }
                }

                //execute decreasing sort on docAndRelevance
                Dictionary <string, double> sortedByRelevance = DictionaryDecreasedSort.DecreasedByValue(docAndRelevance);

                // A builder avoids re-copying the whole report for every appended row.
                System.Text.StringBuilder simContent = new System.Text.StringBuilder();
                foreach (KeyValuePair <string, double> ranked in sortedByRelevance)
                {
                    // Strip the leading docsPath-length prefix from the stored file name.
                    string fileName = ranked.Key.Substring(txtDirLength);
                    simContent.Append(topicName).Append("\t").Append(fileName).Append("\t").Append(ranked.Value).Append("\r\n");
                }
                FileOperators.FileAppend(simStorePath, simContent.ToString()); //simStorePath should contain relativePath
            }
            Console.WriteLine("DONE!!");
        }
Example #13
0
        /// <summary>
        /// Loads weighted terms per component and merges in expanded terms with a
        /// fixed weight of 0.2.
        /// </summary>
        /// <remarks>
        /// Base file lines have the form <c>component:term,weight;term,weight;...</c>;
        /// only pairs with exactly one ',' are used and the first weight of a repeated
        /// term wins. Expanded file lines have the form <c>component:term,term,...</c>;
        /// their terms are added only to components already loaded, and only when the
        /// term is non-empty and not yet present. Unknown components are reported on
        /// the console. A null or empty path is treated as "no file".
        /// </remarks>
        /// <param name="termPath">Path of the base term file.</param>
        /// <param name="expanedTermPath">Path of the expanded term file.</param>
        /// <returns>Component name mapped to its terms and weights; never null.</returns>
        /// <exception cref="System.FormatException">A weight is not a valid number.</exception>
        public Dictionary <string, Dictionary <string, float> > GetCombinedTermsWeights(string termPath, string expanedTermPath)
        {
            const float expandedTermWeight = 0.2f;

            Dictionary <string, Dictionary <string, float> > compTerms = new Dictionary <string, Dictionary <string, float> >();

            if (!string.IsNullOrEmpty(termPath))
            {
                foreach (string line in FileOperators.ReadFileLines(termPath))
                {
                    int colonIndex = line.IndexOf(":");
                    if (colonIndex <= 0)
                    {
                        continue;
                    }

                    string compName = line.Substring(0, colonIndex);
                    Dictionary <string, float> terms = new Dictionary <string, float>();
                    foreach (string pair in line.Substring(colonIndex + 1).Split(';'))
                    {
                        string[] termWeight = pair.Split(',');
                        if (termWeight.Length != 2)
                        {
                            continue;
                        }

                        string term = termWeight[0].Trim();
                        // ',' separates fields here, so the decimal mark must be '.'.
                        float weight = float.Parse(termWeight[1], System.Globalization.CultureInfo.InvariantCulture);
                        if (!terms.ContainsKey(term))
                        {
                            terms.Add(term, weight);
                        }
                    }
                    compTerms.Add(compName, terms);
                }
            }

            if (!string.IsNullOrEmpty(expanedTermPath))
            {
                foreach (string expandedLine in FileOperators.ReadFileLines(expanedTermPath))
                {
                    int colonIndex = expandedLine.IndexOf(':');
                    if (colonIndex <= 0)
                    {
                        continue;
                    }

                    string compName = expandedLine.Substring(0, colonIndex);
                    Dictionary <string, float> existedTerms;
                    if (!compTerms.TryGetValue(compName, out existedTerms))
                    {
                        Console.WriteLine("Please check the component:" + compName);
                        continue;
                    }

                    foreach (string rawTerm in expandedLine.Substring(colonIndex + 1).Split(','))
                    {
                        string term = rawTerm.Trim();
                        // A trailing ',' or an empty list would otherwise add "" as a term.
                        if (term.Length > 0 && !existedTerms.ContainsKey(term))
                        {
                            existedTerms.Add(term, expandedTermWeight);
                        }
                    }
                }
            }

            return(compTerms);
        }
Example #14
0
        /// <summary>
        /// Loads the terms of each component and merges in the expanded terms.
        /// </summary>
        /// <remarks>
        /// Base file lines have the form <c>component:term,weight;term,weight;...</c>;
        /// only the term part of each pair is kept. Expanded file lines have the form
        /// <c>component:term,term,...</c>; their terms are added only to components
        /// already loaded, and only when the term is non-empty and not yet present.
        /// Unknown components are reported on the console. A null or empty path is
        /// skipped, as is an expanded file whose lines cannot be read.
        /// </remarks>
        /// <param name="termPath">Path of the base term file.</param>
        /// <param name="expanedTermPath">Path of the expanded term file.</param>
        /// <returns>
        /// Component name mapped to its terms, or null when reading
        /// <paramref name="termPath"/> yields null.
        /// </returns>
        public Dictionary <string, List <string> > GetCombinedTerms(string termPath, string expanedTermPath)
        {
            Dictionary <string, List <string> > compTerms = new Dictionary <string, List <string> >();

            if (!string.IsNullOrEmpty(termPath))
            {
                string[] termLines = FileOperators.ReadFileLines(termPath);
                if (termLines == null)
                {
                    return(null);
                }

                foreach (string line in termLines)
                {
                    int colonIndex = line.IndexOf(":");
                    if (colonIndex <= 0)
                    {
                        continue;
                    }

                    string        compName = line.Substring(0, colonIndex);
                    List <string> terms    = new List <string>();
                    foreach (string pair in line.Substring(colonIndex + 1).Split(';'))
                    {
                        int commaIndex = pair.IndexOf(',');
                        if (commaIndex > 0)
                        {
                            terms.Add(pair.Substring(0, commaIndex).Trim());
                        }
                    }
                    compTerms.Add(compName, terms);
                }
            }

            if (!string.IsNullOrEmpty(expanedTermPath))
            {
                // The base-file branch shows ReadFileLines can return null; without
                // this check the loop below would throw instead of returning the
                // base terms.
                string[] expandedLines = FileOperators.ReadFileLines(expanedTermPath);
                if (expandedLines == null)
                {
                    return(compTerms);
                }

                foreach (string expandedLine in expandedLines)
                {
                    int colonIndex = expandedLine.IndexOf(':');
                    if (colonIndex <= 0)
                    {
                        continue;
                    }

                    string        compName = expandedLine.Substring(0, colonIndex);
                    List <string> existedTerms;
                    if (!compTerms.TryGetValue(compName, out existedTerms))
                    {
                        Console.WriteLine("Please check the component:" + compName);
                        continue;
                    }

                    foreach (string rawTerm in expandedLine.Substring(colonIndex + 1).Split(','))
                    {
                        string term = rawTerm.Trim();
                        // A trailing ',' or an empty list would otherwise add "" as a term.
                        if (term.Length > 0 && !existedTerms.Contains(term))
                        {
                            existedTerms.Add(term);
                        }
                    }
                }
            }

            return(compTerms);
        }
Example #15
0
        /// <summary>
        /// Turns each line of a file into a paragraph made of its informative
        /// sentences.
        /// </summary>
        /// <remarks>
        /// "e.g." and "i.e.," are removed from the line, which is then split on ". ".
        /// A piece is dropped when it mentions one of a fixed set of reference words
        /// (appendix, figure, table, ...), ends in a digit, or is rejected by
        /// <c>IsSynonym</c>, <c>IsSectionHead</c>, <c>ContainEnoughInfo</c> or the
        /// prefix/phrase checks. Kept pieces start at their first lower-case a-z
        /// character and are re-joined with ". ". Lines that keep no piece produce
        /// no paragraph.
        /// </remarks>
        /// <param name="filePath">File to read.</param>
        /// <returns>One paragraph per line that retained at least one piece.</returns>
        public List <string> SplitSingleParaRemoveUseless(string filePath)
        {
            string[]      fileLines = FileOperators.ReadFileLines(filePath);
            List <string> paras     = new List <string>();
            string[]      seperator = { ". " };

            foreach (string aLine in fileLines)
            {
                // Chain the removals so a line containing both abbreviations loses both.
                string curLine = aLine.Replace("e.g.", "").Replace("i.e.,", "");
                string curPara = "";

                string[] pieces = curLine.Split(seperator, StringSplitOptions.RemoveEmptyEntries);
                foreach (string piece in pieces)
                {
                    if (piece.Contains("appendix") || piece.Contains("figure") || piece.Contains("table") ||
                        piece.Contains("section") || piece.Contains("standard") || piece.Contains("version") ||
                        piece.Contains("http") || piece.Contains("www"))
                    {
                        continue;
                    }

                    // Start the piece at its first lower-case letter, if it has one.
                    string filteredPiece = piece.Trim();
                    for (int i = 0; i < piece.Length; i++)
                    {
                        char curChar = piece[i];
                        if (curChar >= 'a' && curChar <= 'z')
                        {
                            filteredPiece = piece.Substring(i).Trim();
                            break;
                        }
                    }

                    if (filteredPiece.Length == 0)
                    {
                        // Whitespace-only piece: nothing to keep, and indexing the
                        // last character below would be out of range.
                        continue;
                    }

                    char endChar = filteredPiece[filteredPiece.Length - 1];
                    if (endChar >= '0' && endChar <= '9')
                    {
                        continue;
                    }

                    if (IsSynonym(filteredPiece)) //if the sentence is to define the synonym, ignore
                    {
                        continue;
                    }

                    if (IsSectionHead(filteredPiece))
                    {
                        continue;
                    }

                    if (ContainEnoughInfo(filteredPiece) &&
                        !filteredPiece.Contains("the following") &&
                        !filteredPiece.StartsWith("commentary") &&
                        !filteredPiece.StartsWith("see") &&
                        !filteredPiece.StartsWith("of") &&
                        !filteredPiece.StartsWith("description:") &&
                        !filteredPiece.StartsWith("content"))
                    {
                        curPara += filteredPiece + ". ";
                    }
                }

                if (curPara.Length > 0)
                {
                    paras.Add(curPara);
                }
            }

            return(paras);
        }