/// <summary>
/// Runs the full text pre-processing pipeline on <paramref name="data"/>:
/// strips digits and a fixed set of symbols, collapses any other non-alphanumeric
/// run to a space, splits Pascal/camel-case words, removes stop words, stems the
/// result and returns the distinct keywords.
/// </summary>
/// <param name="data">Raw text to normalise. A null value is rejected by <c>Regex.Replace</c>.</param>
/// <returns>
/// The distinct keywords (case-insensitive, current culture) in first-occurrence
/// order, joined by single spaces. No sorting is applied.
/// </returns>
public static string DataPreprocess(string data)
{
    // Digits and the listed symbols are deleted outright (no space is inserted in their place).
    string symbolPattern = @"(\d+|\@|\.|\,|\&|\'|\(|\)|<|>|#|{|}|\[|\]|%|\||\\|\/)";
    string withoutSymbols = Regex.Replace(data, symbolPattern, string.Empty, RegexOptions.IgnoreCase);

    // Any remaining run of non-alphanumeric characters becomes a single space.
    string separatorPattern = @"[^a-zA-Z0-9]+";
    string alphanumericOnly = Regex.Replace(withoutSymbols, separatorPattern, " ", RegexOptions.IgnoreCase);

    // Split Pascal/camel-case identifiers into their component words.
    string splitWords = ExtractWordFromPascalCase(alphanumericOnly);

    // Drop stop words.
    string withoutStopWords = RemoveStopwords(splitWords);

    // NOTE(review): the stemmer receives the whole space-separated string in one call;
    // confirm Porter2.stem handles multi-word input rather than a single word.
    Porter2 stemmer = new Porter2();
    string stemmed = stemmer.stem(withoutStopWords);

    // RemoveEmptyEntries keeps leading/trailing/double spaces from producing an
    // empty "keyword", which previously leaked stray whitespace into the result.
    string[] tokens = stemmed.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    return string.Join(" ", tokens.Distinct(StringComparer.CurrentCultureIgnoreCase));
}
/// <summary>
/// Recursively stems every file below <paramref name="sourceFile"/> and writes the
/// results under <paramref name="stemmedPath"/>, mirroring the source directory layout.
/// Does nothing when <paramref name="sourceFile"/> is not a directory.
/// </summary>
/// <param name="sourceFile">Directory whose sub-directories and files are processed.</param>
/// <param name="stemmedPath">Directory path under which the stemmed files are written.</param>
public void FileDirStemming(string sourceFile, string stemmedPath)
{
    if (!IsDirectory(sourceFile))
    {
        return;
    }

    // Sub-directories first: each one is stemmed into a same-named folder under stemmedPath.
    foreach (string childDir in Directory.GetDirectories(sourceFile))
    {
        string childName = FileNameParse.GetFileName(childDir);
        FileDirStemming(childDir, stemmedPath + "\\" + childName);
    }

    // Then the files that live directly in this directory.
    foreach (string childFile in Directory.GetFiles(sourceFile))
    {
        string[] lines = FileOperators.ReadFileLines(childFile);
        string outputName = FileNameParse.GetFileName(childFile);

        string output = "";
        foreach (string line in lines)
        {
            // Tokens are separated by spaces and commas; empty tokens are discarded.
            string[] tokens = line.Split(new string[] { " ", "," }, StringSplitOptions.RemoveEmptyEntries);
            Porter2 stemmer = new Porter2();

            string stemmedLine = "";
            for (int index = 0; index < tokens.Length; index++)
            {
                stemmedLine += stemmer.stem(tokens[index]) + " ";
            }

            output += stemmedLine + "\r\n";
        }

        FileOperators.FileWrite(stemmedPath + "\\" + outputName, output);
    }
}
/// <summary>
/// Expands a query with the highest-weighted candidate terms.
/// </summary>
/// <param name="currentQueryTerms">Terms already in the query. This list is read but not modified.</param>
/// <param name="queryWeights">Candidate terms mapped to their weights.</param>
/// <param name="numOfResults">Maximum number of terms to add; zero or a negative value adds none.</param>
/// <returns>
/// A new list holding the current terms followed by up to <paramref name="numOfResults"/>
/// added terms in descending weight order. Candidates are skipped when they are in
/// <c>wordsToSkip</c>, are already a query term, or stem to a query term or to the stem
/// of a term added earlier.
/// </returns>
private List<string> GetTopTerms(List<string> currentQueryTerms, Dictionary<string, double> queryWeights, int numOfResults)
{
    Porter2 porterStemmer = new Porter2();
    List<string> newQueryTerms = new List<string>(currentQueryTerms);

    // Private membership set: duplicate tracking no longer appends stems to the
    // caller's list, and lookups are O(1) instead of a list scan per candidate.
    HashSet<string> seenTerms = new HashSet<string>(currentQueryTerms);

    int added = 0;
    foreach (KeyValuePair<string, double> candidate in queryWeights.OrderByDescending(kvp => kvp.Value))
    {
        // Checked before adding so that numOfResults <= 0 yields no additions.
        if (added >= numOfResults)
        {
            break;
        }

        if (wordsToSkip.Contains(candidate.Key) || seenTerms.Contains(candidate.Key))
        {
            continue;
        }

        string stem = porterStemmer.stem(candidate.Key);
        if (seenTerms.Contains(stem))
        {
            continue;
        }

        newQueryTerms.Add(candidate.Key);
        seenTerms.Add(stem); // later candidates sharing this stem are treated as duplicates
        added++;
    }

    return newQueryTerms;
}
/// <summary>
/// Stems every space-separated token of <paramref name="oriSentence"/>.
/// </summary>
/// <param name="oriSentence">Sentence to stem; split on single space characters.</param>
/// <returns>The stemmed tokens in order, each one followed by a single space.</returns>
private string StemSentence(string oriSentence)
{
    string[] tokens = oriSentence.Split(' ');
    Porter2 stemmer = new Porter2();

    // Each stem keeps its trailing space, so the result ends with a space.
    return string.Concat(tokens.Select(token => stemmer.stem(token) + " "));
}
/// <summary>
/// Computes the term frequency of each target term inside <paramref name="text"/>.
/// The text is stripped of non-word characters, split on spaces and stemmed before counting.
/// </summary>
/// <param name="text">Text to analyse.</param>
/// <param name="targetTerms">Terms to look up; they are compared against the stemmed text tokens as given.</param>
/// <returns>
/// One count per entry of <paramref name="targetTerms"/>, in the same order;
/// 0 when the term does not occur among the stemmed tokens.
/// </returns>
private List<int> TFInOneText(string text, List<string> targetTerms)
{
    // Replace every non-word character with a space and tokenise the CLEANED text.
    // Previously the cleaned string was computed but the raw text was split, so a
    // token such as "term," was never counted as "term".
    string cleanedText = Regex.Replace(text, "\\W", " ");
    string[] textTerms = cleanedText.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    Dictionary<string, int> stemCounts = new Dictionary<string, int>();
    // Cache of original token -> stem so each distinct token is stemmed only once.
    Dictionary<string, string> stemCache = new Dictionary<string, string>();
    Porter2 porter = new Porter2();

    foreach (string termInText in textTerms)
    {
        string stemmedTerm;
        if (!stemCache.TryGetValue(termInText, out stemmedTerm))
        {
            stemmedTerm = porter.stem(termInText);
            stemCache.Add(termInText, stemmedTerm);
        }

        int count;
        stemCounts.TryGetValue(stemmedTerm, out count); // count stays 0 when the stem is new
        stemCounts[stemmedTerm] = count + 1;
    }

    List<int> tfs = new List<int>(targetTerms.Count);
    foreach (string targetTerm in targetTerms)
    {
        int frequency;
        stemCounts.TryGetValue(targetTerm, out frequency); // 0 when absent
        tfs.Add(frequency);
    }

    return tfs;
}
/// <summary>
/// Splits <paramref name="input"/> into words, stems each word that is not in
/// <c>WordExceptions</c>, and logs every stem together with the index of the
/// sentence it occurred in.
/// </summary>
/// <param name="input">Text to analyse. Words are separated by space, comma, double quote or colon.</param>
/// <returns>The result object on which all stem occurrences were logged.</returns>
public FullAnalysisResult Analyze(string input)
{
    var result = new FullAnalysisResult();
    var stemmer = new Porter2();
    var words = input.Split(new[] { ' ', ',', '"', ':' }, StringSplitOptions.RemoveEmptyEntries);
    var sentenceIndex = 0;

    foreach (var word in words)
    {
        var currentWord = word.ToLowerInvariant();

        // A trailing period marks the last word of a sentence. Ordinal comparison
        // guarantees the character removed below really is the '.'.
        var endsSentence = currentWord.EndsWith(".", StringComparison.Ordinal);
        if (endsSentence)
        {
            // Strip the period from the lower-cased word; stripping it from the raw
            // token (as before) silently discarded the lower-casing.
            currentWord = currentWord.Remove(currentWord.Length - 1, 1);
        }

        // A lone "." carries no word; excepted words are not logged either.
        var skipWord = (endsSentence && String.IsNullOrWhiteSpace(currentWord))
                       || WordExceptions.Contains(currentWord);

        if (!skipWord)
        {
            var stem = stemmer.Stem(currentWord);
            result.LogOccurrence(stem, sentenceIndex);
        }

        // Advance after logging so the closing word still belongs to its own sentence.
        // This now also happens when the closing word was skipped as an exception;
        // previously such a sentence was merged into the following one.
        if (endsSentence)
        {
            sentenceIndex++;
        }
    }

    return result;
}
// Assigns a freshly constructed Porter2 instance to `subject`.
public void Init() => subject = new Porter2();
/// <summary>
/// Scores a sentence by the weighted number of component-term occurrences it contains,
/// divided by the number of tokens that contain at least one ASCII letter.
/// </summary>
/// <param name="sentence">Sentence to score; tokenised by splitting on single spaces.</param>
/// <param name="compTerms">
/// Component terms mapped to their weights. Hyphens are removed from each term and the
/// term is stemmed before it is matched against the stemmed sentence.
/// </param>
/// <returns>
/// The weighted occurrence count divided by the number of letter-bearing tokens.
/// Returns 0 when the sentence has more than <c>SENTENCE_LEN</c> tokens or no
/// letter-bearing token at all (the latter previously produced NaN).
/// </returns>
public float SentenceScore(string sentence, Dictionary<string, float> compTerms)
{
    List<string> sentenceTerms = sentence.Split(' ').ToList();

    // Overly long "sentences" are treated as extraction artefacts (tables, diagrams) and ignored.
    if (sentenceTerms.Count > SENTENCE_LEN)
    {
        return 0.0f;
    }

    Regex letterRegex = new Regex(@"[a-zA-Z]", RegexOptions.None);
    Regex nonWordRegex = new Regex(@"\W", RegexOptions.None);
    Porter2 porter = new Porter2();

    // Build the stemmed sentence ("stem stem stem ") and count the valid tokens.
    string stemmedSentence = "";
    int sentenceLength = 0;
    foreach (string term in sentenceTerms)
    {
        if (letterRegex.IsMatch(term))
        {
            string filteredTerm = nonWordRegex.Replace(term, "");
            stemmedSentence += porter.stem(filteredTerm) + " ";
            sentenceLength++;
        }
    }

    // Nothing to normalise by: avoid the 0/0 division that yielded NaN.
    if (sentenceLength == 0)
    {
        return 0.0f;
    }

    float numerator = 0.0f;
    string remaining = stemmedSentence;
    foreach (KeyValuePair<string, float> compTerm in compTerms)
    {
        float weight = compTerm.Value;
        string stemmedTerm = porter.stem(compTerm.Key.Replace("-", ""));

        // An empty term would make the integer divisions below throw DivideByZeroException.
        if (string.IsNullOrEmpty(stemmedTerm))
        {
            continue;
        }

        // Occurrences are counted by deleting the pattern and measuring how much the
        // text shrank. Matching is substring-based, so a term also matches the tail
        // of a longer stem that ends with it.

        // Pass 1: "<term> " -> " " shrinks the text by term.Length per hit.
        string stripped = remaining.Replace(stemmedTerm + " ", " ");
        int freq = (remaining.Length - stripped.Length) / stemmedTerm.Length;
        if (freq > 0)
        {
            numerator += freq * weight;
        }
        remaining = stripped;

        // Pass 2: plural form "<term>s " -> " " shrinks the text by term.Length + 1 per hit.
        stripped = remaining.Replace(stemmedTerm + "s ", " ");
        freq = (remaining.Length - stripped.Length) / (stemmedTerm.Length + 1);
        if (freq > 0)
        {
            numerator += freq * weight;
        }
        remaining = stripped;

        // Pass 3: "<term>)" -> " " shrinks the text by term.Length per hit.
        // NOTE(review): non-word characters were stripped from every token above, so this
        // can only match if Porter2.stem itself emits ')'; kept for fidelity.
        stripped = remaining.Replace(stemmedTerm + ")", " ");
        freq = (remaining.Length - stripped.Length) / stemmedTerm.Length;
        if (freq > 0)
        {
            numerator += freq * weight;
        }
        remaining = stripped;
    }

    return numerator / sentenceLength;
}
/// <summary>
/// Scores a sentence by the number of component-term occurrences it contains,
/// divided by the number of tokens that contain at least one ASCII letter.
/// </summary>
/// <param name="sentence">
/// Sentence to score. Every non-word character is replaced by a space before the
/// sentence is split on single spaces.
/// </param>
/// <param name="compTerms">
/// Component terms. Hyphens are removed from each term and the term is stemmed
/// before it is matched against the stemmed sentence.
/// </param>
/// <returns>
/// The occurrence count divided by the number of letter-bearing tokens.
/// Returns 0 when the sentence has more than <c>SENTENCE_LEN</c> tokens or no
/// letter-bearing token at all (the latter previously produced NaN).
/// </returns>
public float SentenceScore(string sentence, List<string> compTerms)
{
    Regex letterRegex = new Regex(@"[a-zA-Z]", RegexOptions.None);
    Regex nonWordRegex = new Regex(@"\W", RegexOptions.None);

    string filteredSentence = nonWordRegex.Replace(sentence, " ");
    List<string> sentenceTerms = filteredSentence.Split(' ').ToList();

    // Overly long "sentences" are treated as extraction artefacts (tables, diagrams) and ignored.
    if (sentenceTerms.Count > SENTENCE_LEN)
    {
        return 0.0f;
    }

    Porter2 porter = new Porter2();

    // Build the stemmed sentence ("stem stem stem ") and count the valid tokens.
    // Tokens hold no non-word characters here: those were all turned into the
    // separator above, so no per-token filtering is needed.
    string stemmedSentence = "";
    int sentenceLength = 0;
    foreach (string term in sentenceTerms)
    {
        if (letterRegex.IsMatch(term))
        {
            stemmedSentence += porter.stem(term) + " ";
            sentenceLength++;
        }
    }

    // Nothing to normalise by: avoid the 0/0 division that yielded NaN.
    if (sentenceLength == 0)
    {
        return 0.0f;
    }

    float numerator = 0.0f;
    string remaining = stemmedSentence;
    foreach (string compTerm in compTerms)
    {
        string stemmedTerm = porter.stem(compTerm.Replace("-", ""));

        // An empty term would make the integer divisions below throw DivideByZeroException.
        if (string.IsNullOrEmpty(stemmedTerm))
        {
            continue;
        }

        // Occurrences are counted by deleting the pattern and measuring how much the
        // text shrank. Matching is substring-based, so a term also matches the tail
        // of a longer stem that ends with it.

        // Pass 1: "<term> " -> " " shrinks the text by term.Length per hit.
        string stripped = remaining.Replace(stemmedTerm + " ", " ");
        int freq = (remaining.Length - stripped.Length) / stemmedTerm.Length;
        if (freq > 0)
        {
            numerator += freq;
        }
        remaining = stripped;

        // Pass 2: plural form "<term>s " -> " " shrinks the text by term.Length + 1 per hit.
        stripped = remaining.Replace(stemmedTerm + "s ", " ");
        freq = (remaining.Length - stripped.Length) / (stemmedTerm.Length + 1);
        if (freq > 0)
        {
            numerator += freq;
        }
        remaining = stripped;

        // Pass 3: "<term>)" -> " " shrinks the text by term.Length per hit.
        // NOTE(review): non-word characters were removed from the sentence above, so this
        // can only match if Porter2.stem itself emits ')'; kept for fidelity.
        stripped = remaining.Replace(stemmedTerm + ")", " ");
        freq = (remaining.Length - stripped.Length) / stemmedTerm.Length;
        if (freq > 0)
        {
            numerator += freq;
        }
        remaining = stripped;
    }

    return numerator / sentenceLength;
}