/// <summary>
    /// Normalizes free text into a space-separated list of distinct, stemmed keywords.
    /// </summary>
    /// <remarks>
    /// Steps, in order: digits and a fixed set of symbols are deleted; every remaining run of
    /// non-alphanumeric characters becomes a single space; the text is passed through
    /// <c>ExtractWordFromPascalCase</c>, <c>RemoveStopwords</c> and the Porter2 stemmer; finally
    /// duplicate tokens are dropped (case-insensitive, current culture). No sorting is applied.
    /// </remarks>
    /// <param name="data">Raw text to normalize. Must not be null (Regex.Replace rejects null input).</param>
    /// <returns>The remaining keywords joined by single spaces, with no empty tokens.</returns>
    public static string DataPreprocess(string data)
    {
        // Delete digits and the listed symbols outright (replaced with nothing, not with a space).
        string symbolPattern  = @"(\d+|\@|\.|\,|\&|\'|\(|\)|<|>|#|{|}|\[|\]|%|\||\\|\/)";
        string withoutSymbols = Regex.Replace(data, symbolPattern, string.Empty, RegexOptions.IgnoreCase);

        // Collapse whatever non-alphanumeric characters are left into single spaces.
        string separatorPattern = @"[^a-zA-Z0-9]+";
        string filtered         = Regex.Replace(withoutSymbols, separatorPattern, " ", RegexOptions.IgnoreCase);

        // Split Pascal/camel-case words (delegated to the helper).
        string nopascal = ExtractWordFromPascalCase(filtered);

        // Remove stop words (delegated to the helper).
        string withoutStopWords = RemoveStopwords(nopascal);

        // NOTE(review): the whole multi-word string is handed to the stemmer in one call;
        // confirm that Porter2.stem handles more than a single word.
        Porter2 stem     = new Porter2();
        string  stemming = stem.stem(withoutStopWords);

        // Drop empty tokens so leading, trailing or doubled spaces do not leak into the result,
        // then keep only the first occurrence of each keyword.
        string[] tokens = stemming.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        return string.Join(" ", tokens.Distinct(StringComparer.CurrentCultureIgnoreCase));
    }
예제 #2
0
 /// <summary>
 /// Stems every file under a source directory and writes the results under a target path,
 /// reproducing the source directory hierarchy.
 /// </summary>
 /// <param name="sourceFile">Path to process. Nothing happens unless <c>IsDirectory</c> accepts it.</param>
 /// <param name="stemmedPath">Directory path under which the stemmed copies are written.</param>
 public void FileDirStemming(string sourceFile, string stemmedPath)
 {
     if (!IsDirectory(sourceFile))
     {
         return;
     }

     // Recurse into sub-directories first, extending the target path by the same name.
     foreach (string subDir in Directory.GetDirectories(sourceFile))
     {
         string subDirName = FileNameParse.GetFileName(subDir);
         FileDirStemming(subDir, Path.Combine(stemmedPath, subDirName));
     }

     // Loop-invariant: one separator set and one stemmer serve every line of every file.
     string[] separators = { " ", "," };
     Porter2  porter     = new Porter2();

     foreach (string subFile in Directory.GetFiles(sourceFile))
     {
         string[] fileLines = FileOperators.ReadFileLines(subFile);
         string   fileName  = FileNameParse.GetFileName(subFile);

         // A builder avoids re-copying the whole accumulated content for every term and line.
         var stemmedContent = new System.Text.StringBuilder();
         foreach (string fileLine in fileLines)
         {
             string[] terms = fileLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
             foreach (string term in terms)
             {
                 // Each stemmed term is followed by a single space, including the last one.
                 stemmedContent.Append(porter.stem(term)).Append(' ');
             }
             stemmedContent.Append("\r\n");
         }

         FileOperators.FileWrite(Path.Combine(stemmedPath, fileName), stemmedContent.ToString());
     }
 }
        /// <summary>
        /// Extends the current query with the highest-weighted candidate terms.
        /// </summary>
        /// <remarks>
        /// Candidates are visited in descending weight order. A candidate is skipped when it is in
        /// <c>wordsToSkip</c>, or when it or its stem is already in <paramref name="currentQueryTerms"/>.
        /// Side effect: the stem of every accepted term is appended to
        /// <paramref name="currentQueryTerms"/>, so that list is modified for the caller.
        /// </remarks>
        /// <param name="currentQueryTerms">Terms already in the query; receives the stems of accepted terms.</param>
        /// <param name="queryWeights">Candidate terms mapped to their weights.</param>
        /// <param name="numOfResults">Maximum number of terms to add; zero or less adds nothing.</param>
        /// <returns>A new list holding the original query terms followed by the accepted terms.</returns>
        private List<string> GetTopTerms(List<string> currentQueryTerms, Dictionary<string, double> queryWeights, int numOfResults)
        {
            List<string> newQueryTerms = new List<string>(currentQueryTerms);

            // Without this guard one term would be added before the limit is ever checked.
            if (numOfResults <= 0)
            {
                return newQueryTerms;
            }

            Porter2 porterStemmer = new Porter2();
            int     added         = 0;

            foreach (var term in queryWeights.OrderByDescending(kvp => kvp.Value))
            {
                if (wordsToSkip.Contains(term.Key) || currentQueryTerms.Contains(term.Key))
                {
                    continue;
                }

                // Stem once and reuse the result for both the duplicate check and the bookkeeping.
                string stemmedKey = porterStemmer.stem(term.Key);
                if (currentQueryTerms.Contains(stemmedKey))
                {
                    continue;
                }

                newQueryTerms.Add(term.Key);
                currentQueryTerms.Add(stemmedKey);

                added++;
                if (added >= numOfResults)
                {
                    break;
                }
            }

            return newQueryTerms;
        }
예제 #4
0
파일: MMRSummary.cs 프로젝트: lawenliu/MaRK
        /// <summary>
        /// Stems each space-separated token of a sentence and joins the results back together.
        /// </summary>
        /// <param name="oriSentence">Sentence to stem; it is split on single space characters.</param>
        /// <returns>
        /// The stemmed tokens in their original order, each followed by one space
        /// (so the result ends with a trailing space).
        /// </returns>
        private string StemSentence(string oriSentence)
        {
            string[] tokens  = oriSentence.Split(' ');
            Porter2  stemmer = new Porter2();

            // Every token, including empty ones produced by consecutive spaces, is stemmed and suffixed with a space.
            return string.Concat(tokens.Select(token => stemmer.stem(token) + " "));
        }
예제 #5
0
파일: TF_IDF.cs 프로젝트: lawenliu/MaRK
        /// <summary>
        /// Counts how often each target term occurs, after stemming, in a text.
        /// </summary>
        /// <param name="text">Text to analyze; it is split on single space characters.</param>
        /// <param name="targetTerms">Terms to look up among the stemmed tokens (compared as given, not stemmed here).</param>
        /// <returns>One count per entry of <paramref name="targetTerms"/>, in the same order; 0 when the term is absent.</returns>
        private List<int> TFInOneText(string text, List<string> targetTerms)
        {
            if (text == null)
            {
                throw new System.ArgumentNullException("text");
            }

            // NOTE(review): the previous version built a copy of the text with every \W character
            // replaced by a space but never used it; tokens still come from the raw text, so
            // punctuation stays attached to words. Confirm whether stripping was intended.
            string[] textTerms = text.Split(' ');

            Dictionary<string, int>    stemCounts = new Dictionary<string, int>();
            Dictionary<string, string> stemCache  = new Dictionary<string, string>();
            Porter2 porter = new Porter2();

            foreach (string termInText in textTerms)
            {
                // Stem each distinct surface form only once.
                string stemmed;
                if (!stemCache.TryGetValue(termInText, out stemmed))
                {
                    stemmed = porter.stem(termInText);
                    stemCache.Add(termInText, stemmed);
                }

                // TryGetValue leaves the count at 0 for a stem not seen before.
                int count;
                stemCounts.TryGetValue(stemmed, out count);
                stemCounts[stemmed] = count + 1;
            }

            List<int> tfs = new List<int>(targetTerms.Count);
            foreach (string targetTerm in targetTerms)
            {
                int frequency;
                stemCounts.TryGetValue(targetTerm, out frequency);
                tfs.Add(frequency);
            }

            return tfs;
        }
        /// <summary>
        /// Splits the input into words, stems them, and records each stem together with the
        /// index of the sentence it occurred in.
        /// </summary>
        /// <remarks>
        /// Words are separated by space, comma, double quote and colon. A word ending in '.' closes
        /// the current sentence: the period is removed and the sentence index advances after the
        /// word is handled. Words found in <c>WordExceptions</c> are not recorded.
        /// </remarks>
        /// <param name="input">Text to analyze.</param>
        /// <returns>The result object populated through <c>LogOccurrence</c>.</returns>
        public FullAnalysisResult Analyze(string input)
        {
            var result  = new FullAnalysisResult();
            var stemmer = new Porter2();

            var wordSplit     = input.Split(new[] { ' ', ',', '"', ':' }, StringSplitOptions.RemoveEmptyEntries);
            var sentenceIndex = 0;

            foreach (var word in wordSplit)
            {
                var currentWord  = word.ToLowerInvariant();
                var endsSentence = currentWord.EndsWith(".", StringComparison.Ordinal);

                if (endsSentence)
                {
                    // Strip the period from the lower-cased word so the lower-casing is not lost.
                    currentWord = currentWord.Substring(0, currentWord.Length - 1);
                }

                // A bare "." carries no word; it only closes the sentence.
                var isBarePeriod = endsSentence && String.IsNullOrWhiteSpace(currentWord);

                if (!isBarePeriod && !WordExceptions.Contains(currentWord))
                {
                    result.LogOccurrence(stemmer.Stem(currentWord), sentenceIndex);
                }

                // Advance even when the closing word was skipped, so later words are not
                // attributed to the sentence that just ended.
                if (endsSentence)
                {
                    sentenceIndex++;
                }
            }

            return result;
        }
예제 #7
0
 /// <summary>
 /// Assigns a newly constructed <c>Porter2</c> instance to <c>subject</c>.
 /// </summary>
 // NOTE(review): presumably the per-test setup of a test fixture — confirm against the enclosing class.
 public void Init()
 {
     subject = new Porter2();
 }
예제 #8
0
        /// <summary>
        /// Scores a sentence by the weighted number of component-term occurrences it contains,
        /// divided by the number of its tokens that contain a letter.
        /// </summary>
        /// <remarks>
        /// Tokens containing at least one ASCII letter are stripped of non-word characters and
        /// stemmed. Each component term has its hyphens removed and is stemmed, then counted in
        /// the stemmed sentence when followed by " ", "s " or ")". Matching is by substring, and
        /// matched text is removed so later terms cannot count it again.
        /// </remarks>
        /// <param name="sentence">Sentence to score; it is split on single space characters.</param>
        /// <param name="compTerms">Component terms mapped to their weights.</param>
        /// <returns>
        /// The score, or 0 when the sentence has more than <c>SENTENCE_LEN</c> tokens or no token
        /// containing a letter.
        /// </returns>
        public float SentenceScore(string sentence, Dictionary<string, float> compTerms)
        {
            string[] sentenceTerms = sentence.Split(' ');

            // Overly long "sentences" are treated as extraction noise (tables, diagrams) and ignored.
            if (sentenceTerms.Length > SENTENCE_LEN)
            {
                return 0.0f;
            }

            Regex   hasLetter = new Regex(@"[a-zA-Z]", RegexOptions.None);
            Regex   nonWord   = new Regex(@"\W", RegexOptions.None);
            Porter2 porter    = new Porter2();

            string stemmedSentence = "";
            int    sentenceLength  = 0;

            foreach (string term in sentenceTerms)
            {
                if (!hasLetter.IsMatch(term))
                {
                    continue;
                }

                stemmedSentence += porter.stem(nonWord.Replace(term, "")) + " ";
                sentenceLength++;
            }

            // Without any valid word the division below would yield NaN.
            if (sentenceLength == 0)
            {
                return 0.0f;
            }

            // Each pass replaces "<term><suffix>" with a single space, so one match shortens the
            // text by term.Length + suffix.Length - 1 characters.
            string[] suffixes  = { " ", "s ", ")" };
            string   remaining = stemmedSentence;
            float    numerator = 0.0f;

            foreach (KeyValuePair<string, float> entry in compTerms)
            {
                string dehyphenated = entry.Key.Replace("-", "");
                if (dehyphenated.Length == 0)
                {
                    continue; // An empty term would cause a divide-by-zero below.
                }

                string stemmedTerm = porter.stem(dehyphenated);
                if (string.IsNullOrEmpty(stemmedTerm))
                {
                    continue;
                }

                foreach (string suffix in suffixes)
                {
                    string stripped = remaining.Replace(stemmedTerm + suffix, " ");
                    int    freq     = (remaining.Length - stripped.Length) / (stemmedTerm.Length + suffix.Length - 1);
                    if (freq > 0)
                    {
                        numerator += freq * entry.Value;
                    }

                    remaining = stripped;
                }
            }

            return numerator / sentenceLength;
        }
예제 #9
0
        /// <summary>
        /// Scores a sentence by the number of component-term occurrences it contains, divided by
        /// the number of its tokens that contain a letter.
        /// </summary>
        /// <remarks>
        /// Non-word characters in the sentence are replaced by spaces before splitting. Tokens
        /// containing at least one ASCII letter are stemmed. Each component term has its hyphens
        /// removed and is stemmed, then counted in the stemmed sentence when followed by " ",
        /// "s " or ")". Matching is by substring, and matched text is removed so later terms
        /// cannot count it again.
        /// </remarks>
        /// <param name="sentence">Sentence to score.</param>
        /// <param name="compTerms">Component terms to look for.</param>
        /// <returns>
        /// The score, or 0 when the sentence has more than <c>SENTENCE_LEN</c> tokens or no token
        /// containing a letter.
        /// </returns>
        public float SentenceScore(string sentence, List<string> compTerms)
        {
            Regex hasLetter = new Regex(@"[a-zA-Z]", RegexOptions.None);
            Regex nonWord   = new Regex(@"\W", RegexOptions.None);

            string[] sentenceTerms = nonWord.Replace(sentence, " ").Split(' ');

            // Overly long "sentences" are treated as extraction noise (tables, diagrams) and ignored.
            if (sentenceTerms.Length > SENTENCE_LEN)
            {
                return 0.0f;
            }

            Porter2 porter          = new Porter2();
            string  stemmedSentence = "";
            int     sentenceLength  = 0;

            foreach (string term in sentenceTerms)
            {
                if (!hasLetter.IsMatch(term))
                {
                    continue;
                }

                stemmedSentence += porter.stem(nonWord.Replace(term, "")) + " ";
                sentenceLength++;
            }

            // Without any valid word the division below would yield NaN.
            if (sentenceLength == 0)
            {
                return 0.0f;
            }

            // Each pass replaces "<term><suffix>" with a single space, so one match shortens the
            // text by term.Length + suffix.Length - 1 characters.
            string[] suffixes  = { " ", "s ", ")" };
            string   remaining = stemmedSentence;
            float    numerator = 0.0f;

            foreach (string compTerm in compTerms)
            {
                string dehyphenated = compTerm.Replace("-", "");
                if (dehyphenated.Length == 0)
                {
                    continue; // An empty term would cause a divide-by-zero below.
                }

                string stemmedTerm = porter.stem(dehyphenated);
                if (string.IsNullOrEmpty(stemmedTerm))
                {
                    continue;
                }

                foreach (string suffix in suffixes)
                {
                    string stripped = remaining.Replace(stemmedTerm + suffix, " ");
                    int    freq     = (remaining.Length - stripped.Length) / (stemmedTerm.Length + suffix.Length - 1);
                    if (freq > 0)
                    {
                        numerator += freq;
                    }

                    remaining = stripped;
                }
            }

            return numerator / sentenceLength;
        }