/// <summary>
/// Produces a summary whose sentence budget is a fraction of the number of input sentences.
/// </summary>
/// <param name="sents">Candidate sentences; passed on unchanged to SummarizeBySentenceCount.</param>
/// <param name="ratio">Fraction of the sentences to keep; must be greater than 0 and at most 1.</param>
/// <returns>Whatever SummarizeBySentenceCount returns for the computed sentence budget.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when ratio is not inside (0, 1]; NaN is rejected as well.</exception>
public static string SummarizeByCompressionRatio(Sentence[] sents, double ratio)
{
    // Expressed as a positive range test on purpose: NaN fails both comparisons and is rejected.
    bool ratioIsUsable = ratio > 0 && ratio <= 1;
    if (!ratioIsUsable)
    {
        throw new ArgumentOutOfRangeException("ratio");
    }

    // The cast truncates toward zero, so a small input/ratio combination can give a budget of 0.
    int sentenceBudget = (int)(sents.Length * ratio);
    return SummarizeBySentenceCount(sents, sentenceBudget);
}
/// <summary>
/// Collects the distinct words of the first sentence that also occur in the second sentence.
/// </summary>
/// <param name="firstSentence">Sentence whose words are enumerated.</param>
/// <param name="secondSentence">Sentence whose word list is probed for membership.</param>
/// <returns>A new set holding every word found in both sentences (empty when there is no overlap).</returns>
public static HashSet<string> getCommonWords(Sentence firstSentence, Sentence secondSentence)
{
    HashSet<string> shared = new HashSet<string>();

    foreach (string candidate in firstSentence.words)
    {
        if (!secondSentence.words.Contains(candidate))
        {
            continue;
        }

        // The set silently ignores repeats, so each shared word is kept once.
        shared.Add(candidate);
    }

    return shared;
}
/// <summary>
/// Concatenates the full text of the first <paramref name="count"/> sentences, one per line.
/// </summary>
/// <param name="sents">Sentences in the order they should appear in the output.</param>
/// <param name="count">Maximum number of sentences to emit; clamped to the array length.
/// A value of zero or less yields an empty string.</param>
/// <returns>The selected sentences, each followed by Environment.NewLine.</returns>
public static string getText(Sentence[] sents, int count)
{
    int numSents = Math.Min(count, sents.Length);

    // A builder avoids re-copying the whole accumulated summary on every appended sentence.
    // The type is fully qualified so that no additional using directive is required.
    System.Text.StringBuilder genSummary = new System.Text.StringBuilder();
    for (int i = 0; i < numSents; i++)
    {
        genSummary.Append(sents[i].fullText).Append(Environment.NewLine);
    }

    return genSummary.ToString();
}
/// <summary>
/// Orders the sentences from highest to lowest according to SentenceComparer, traces each
/// sentence with its weight, and returns the text of the leading <paramref name="count"/> sentences.
/// </summary>
/// <param name="sents">Sentences to rank. The array is reordered in place; no copy is taken
/// (a defensive copy exists only as disabled code in the original).</param>
/// <param name="count">Number of top-ranked sentences to include in the result.</param>
/// <returns>The text assembled by getText from the reordered array.</returns>
public static string SummarizeBySentenceCount(Sentence[] sents, int count)
{
    // Ascending sort followed by a reversal leaves the array in descending comparer order.
    SentenceComparer ordering = new SentenceComparer();
    Array.Sort(sents, ordering);
    Array.Reverse(sents);

    // Diagnostic dump of the ranking.
    for (int position = 0; position < sents.Length; position++)
    {
        Sentence ranked = sents[position];
        Trace.write(ranked.fullText);
        Trace.write("Weight : " + ranked.weight);
    }

    return getText(sents, count);
}
/// <summary>
/// Computes the idf-modified cosine similarity between two sentences: the sum over shared words of
/// tf(first) * tf(second) * idf^2, divided by the product of the two sentences' tf*idf vector lengths.
/// </summary>
/// <param name="idf">Source of inverse-document-frequency values, queried through idf.get(word).</param>
/// <param name="firstSentence">First sentence.</param>
/// <param name="secondSentence">Second sentence.</param>
/// <returns>
/// The similarity value, or 0 when either sentence has a zero-length tf*idf vector (for example a
/// sentence without words). The previous implementation divided 0 by 0 in that case and returned NaN,
/// which then propagated into every score derived from it.
/// </returns>
public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
{
    // Distinct words that appear in both sentences.
    HashSet<string> commonWords = new HashSet<string>();
    foreach (string aWord in firstSentence.words)
    {
        if (secondSentence.words.Contains(aWord))
        {
            commonWords.Add(aWord);
        }
    }

    double numerator = 0;
    foreach (string aWord in commonWords)
    {
        numerator += (termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf.get(aWord), 2));
    }

    double denominator = idfWeightedNorm(idf, firstSentence) * idfWeightedNorm(idf, secondSentence);

    // Exact comparison is intended: only a true zero makes the division undefined.
    if (denominator == 0)
    {
        return 0;
    }

    return numerator / denominator;
}

// Square root of the summed squared tf*idf weights over the sentence's word list.
// Repeated words are visited once per occurrence, exactly as the original inline loops did.
private static double idfWeightedNorm(IDF idf, Sentence sentence)
{
    double sumOfSquares = 0;
    foreach (string aWord in sentence.words)
    {
        sumOfSquares += Math.Pow(termFrequency(sentence, aWord) * idf.get(aWord), 2);
    }

    return Math.Sqrt(sumOfSquares);
}
/// <summary>
/// Cleans and normalizes the raw document text, splits it into sentences, and builds a
/// <c>Document</c> whose sentences carry their word list and per-word occurrence counts.
/// </summary>
/// <param name="docText">Raw document text.</param>
/// <returns>
/// A document with <c>originalText</c>, <c>fullText</c>, <c>sentences</c> and, when the first
/// fragment qualifies, <c>title</c> populated.
/// </returns>
public override Document process(string docText)
{
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Remove extra characters and words.
    // Drop a CRLF that is directly followed by more text (joins hard-wrapped lines).
    docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
    // Strip the "(to be continued)" marker.
    docText = Regex.Replace(docText, @"\(يتبع\)", "");
    // Strip the "(the end)" marker together with everything that follows it.
    docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

    // Normalize characters: alef-with-hamza forms become bare alef, alef maksura becomes yeh.
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    // An unused Regex.Match over the whole text was removed here: its result was never read and
    // its nested quantifier "(.*/)+" could backtrack heavily on long inputs.

    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    ArrayList sentences = new ArrayList();
    for (int i = 0; i < splits.Length; i++)
    {
        string text = splits[i];

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove stop words.
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits)
        {
            words.Add(word);
            if (wordsCount[word] == null)
            {
                wordsCount[word] = 1;
            }
            else
            {
                wordsCount[word] = (int)wordsCount[word] + 1;
            }
        }

        sent.words = words;
        sent.wordsCount = wordsCount;

        // Is it a title? Only the first fragment can be one, and only when it does not contain
        // the token below (presumably the KUNA agency tag of a dateline -- confirm).
        // The check is positional: the former value comparison against splits[0] also matched any
        // later fragment with the same text, which then replaced the title and was lost as a sentence.
        if (i == 0 && !Regex.IsMatch(text, @"(.*)كونا(.*)"))
        {
            doc.title = sent;
        }
        else
        {
            sentences.Add(sent);
        }
    }

    doc.sentences = sentences;
    return doc;
}
// ---------------------------------------------------------------------------------------------
// Legacy, disabled DocsStatistics-based implementation, kept for reference only.
//
// public static double[][] generateIdfModifiedCosineMatrix(DocsStatistics docStats, ArrayList sentences)
// {
//     double[][] idfModifiedCosine = new double[sentences.Count][];
//     for (int i = 0; i < sentences.Count; i++)
//     {
//         idfModifiedCosine[i] = new double[sentences.Count];
//     }
//     for (int i = 0; i < sentences.Count; i++)
//     {
//         Sentence firstSent = (Sentence)sentences[i];
//         for (int j = 0; j < sentences.Count; j++)
//         {
//             // same sentence then 1
//             //* if (i == j) { idfModifiedCosine[i][j] = 1; continue; } //* /
//             // has been processed before
//             if (idfModifiedCosine[i][j] != 0) continue;
//             Sentence secondSent = (Sentence)sentences[j];
//             idfModifiedCosine[i][j] = idfModifiedCos(docStats, firstSent, secondSent);
//             idfModifiedCosine[j][i] = idfModifiedCosine[i][j];
//         }
//     }
//     return (idfModifiedCosine);
// }
//
// public static double idfModifiedCos(DocsStatistics docStats, Sentence firstSentence, Sentence secondSentence)
// {
//     double idfModifiedCosine = 0;
//     HashSet<string> commonWords = new HashSet<string>();
//     foreach (string aWord in firstSentence.words)
//     {
//         if (secondSentence.words.Contains(aWord)) commonWords.Add(aWord);
//     }
//     double numerator = 0;
//     foreach (string aWord in commonWords)
//     {
//         numerator += (termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf(docStats, aWord), 2));
//     }
//     double denominator1 = 0;
//     foreach (string aWord in firstSentence.words)
//     {
//         //if (docStats.wordRefs[aWord] != null)
//         denominator1 += Math.Pow(termFrequency(firstSentence, aWord) * idf(docStats, aWord), 2);
//     }
//     denominator1 = Math.Sqrt(denominator1);
//     double denominator2 = 0;
//     foreach (string aWord in secondSentence.words)
//     {
//         //if (docStats.wordRefs[aWord] != null)
//         denominator2 += Math.Pow(termFrequency(secondSentence, aWord) * idf(docStats, aWord), 2);
//     }
//     denominator2 = Math.Sqrt(denominator2);
//     idfModifiedCosine = numerator / (denominator1 * denominator2);
//     return (idfModifiedCosine);
// }
// ---------------------------------------------------------------------------------------------

/// <summary>
/// Returns how many times <paramref name="word"/> occurs in the sentence, as recorded in the
/// sentence's <c>wordsCount</c> table.
/// </summary>
/// <param name="sent">Sentence whose occurrence table is consulted.</param>
/// <param name="word">Word to look up.</param>
/// <returns>
/// The raw occurrence count, or 0 when the table has no entry for the word. The value is not
/// divided by the sentence length; a length-normalized variant exists only as disabled code:
/// <c>(int)sent.wordsCount[firstWord] / sent.words.Length</c>.
/// </returns>
public static double termFrequency(Sentence sent, string word)
{
    if (sent.wordsCount[word] == null)
    {
        return 0;
    }

    // The table stores boxed ints; unbox, then widen to double for the caller.
    int occurrences = (int)sent.wordsCount[word];
    return occurrences;
}
/// <summary>
/// Measures word overlap between two sentences as
/// 2 * (number of distinct shared words) / (word count of first + word count of second).
/// </summary>
/// <param name="firstSentence">First sentence.</param>
/// <param name="secondSentence">Second sentence.</param>
/// <returns>
/// The overlap ratio, or 0 when neither sentence has any words. The previous implementation divided
/// 0 by 0 in that case and returned NaN.
/// </returns>
private static double redundancyPenalty(Sentence firstSentence, Sentence secondSentence)
{
    int totalWords = firstSentence.words.Count + secondSentence.words.Count;
    if (totalWords == 0)
    {
        // Two empty sentences share nothing; report no redundancy instead of NaN.
        return 0;
    }

    HashSet<string> commonWords = SummaryUtil.getCommonWords(firstSentence, secondSentence);
    return (double)(2 * commonWords.Count) / (double)totalWords;
}
/// <summary>
/// Cleans and normalizes the raw document text, splits it into sentences, and builds a
/// <c>Document</c> whose sentences carry their word list and per-word occurrence counts.
/// </summary>
/// <param name="docText">Raw document text.</param>
/// <returns>
/// A document with <c>originalText</c>, <c>fullText</c>, <c>sentences</c> and, when the first
/// fragment qualifies, <c>title</c> populated.
/// </returns>
public override Document process(string docText) {
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Remove extra characters and words.
    // A CRLF directly followed by more text is removed, which joins hard-wrapped lines.
    docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
    // Remove the "(to be continued)" marker.
    docText = Regex.Replace(docText, @"\(يتبع\)", "");
    // Remove the "(the end)" marker and all text after it.
    docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

    // Normalize characters: alef-with-hamza forms to bare alef, alef maksura to yeh.
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    // The former whole-text Regex.Match was dropped: nothing read its result, and the nested
    // quantifier in "(.*/)+" could cause heavy backtracking on long inputs.

    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    ArrayList sentences = new ArrayList();
    for (int index = 0; index < splits.Length; index++) {
        string text = splits[index];

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove stop words.
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits) {
            words.Add(word);
            if (wordsCount[word] == null) {
                wordsCount[word] = 1;
            } else {
                wordsCount[word] = (int)wordsCount[word] + 1;
            }
        }

        sent.words = words;
        sent.wordsCount = wordsCount;

        // Title detection is positional. Comparing the fragment's text with splits[0] (as before)
        // also matched later fragments with identical text, overwriting the title and dropping
        // that sentence. The excluded token is presumably the KUNA agency tag -- confirm.
        bool isFirstFragment = index == 0;
        if (isFirstFragment && !Regex.IsMatch(text, @"(.*)كونا(.*)")) {
            doc.title = sent;
        } else {
            sentences.Add(sent);
        }
    }

    doc.sentences = sentences;
    return doc;
}
/// <summary>
/// Scores a sentence as the sum of four components -- lead position, title overlap, cue words and
/// a tf/idf term -- stores the result in <c>sent.weight</c>, and traces every component.
/// </summary>
/// <param name="idf">Source of inverse-document-frequency values, queried through idf.get(word).</param>
/// <param name="doc">Document the sentence belongs to; supplies the first sentence and the title.</param>
/// <param name="sent">Sentence to score; its <c>weight</c> field is overwritten.</param>
/// <returns>The computed weight (also assigned to <c>sent.weight</c>).</returns>
public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
{
    Trace.write(sent.fullText);

    // 1: ScLead -- the document's first sentence scores 2, every other sentence 1.
    // The Count guard keeps a document without sentences from throwing on the [0] access.
    double sclead;
    if (doc.sentences.Count > 0 && sent == doc.sentences[0])
    {
        sclead = 2;
    }
    else
    {
        sclead = 1;
    }
    Trace.write("SCLead : " + sclead);

    // 2: ScTitle -- each occurrence of a word that also appears in the title adds 2 * tf.
    // The title words are gathered once into a set instead of copying the title's word list to an
    // array for every word of the sentence.
    double sctitle = 0;
    if (doc.title != null)
    {
        HashSet<string> titleWords = new HashSet<string>();
        foreach (string titleWord in doc.title.words)
        {
            titleWords.Add(titleWord);
        }

        foreach (string aWord in sent.words)
        {
            if (titleWords.Contains(aWord))
            {
                sctitle += (2 * termFrequency(sent, aWord));
            }
        }
    }
    Trace.write("SCTitle : " + sctitle);

    // 3: sccue -- each occurrence of a cue word adds its tf.
    double sccue = 0;
    var cueWords = CueWords.getInstance(Conf.CUE_WORDS_PATH); // loop-invariant lookup, fetched once
    foreach (string aWord in sent.words)
    {
        if (cueWords.contains(aWord))
        {
            sccue += termFrequency(sent, aWord);
        }
    }
    Trace.write("SCCue : " + sccue);

    // 4: sctfidf -- sum of ((tf - 1) / tf) * idf over the sentence's words, averaged over the
    // sentence's own word count (document-level normalizations exist only as disabled code).
    double sctfidf = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (tf != 0)
        {
            sctfidf += (((tf - 1) / tf) * idf.get(aWord));
        }
    }

    // A sentence without words keeps a tf/idf score of 0; dividing 0 by 0 would yield NaN and
    // poison the total weight.
    int wordCount = sent.words.Count;
    if (wordCount > 0)
    {
        sctfidf = sctfidf / wordCount;
    }
    Trace.write("SCTFIDF : " + sctfidf);

    double weight = sclead + sctitle + sccue + sctfidf;
    sent.weight = weight;
    Trace.write("Weight : " + weight);
    return weight;
}
/// <summary>
/// Normalizes the raw document text, splits it into sentences, removes stop words, lemmatizes,
/// filters the remaining tokens, and builds a <c>Document</c> whose sentences carry their word
/// list, per-word occurrence counts and order of appearance.
/// </summary>
/// <param name="docText">Raw document text.</param>
/// <returns>
/// A document with <c>originalText</c>, <c>fullText</c> and <c>sentences</c> populated; <c>title</c>
/// is set only when the very first fragment is not blank.
/// </returns>
public virtual Document process(string docText)
{
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Removal of extra characters and words is disabled in this implementation:
    // docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
    // docText = Regex.Replace(docText, @"\(يتبع\)", "");
    // docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

    // Normalize characters: alef-with-hamza forms become bare alef, alef maksura becomes yeh.
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // Built once for the whole document; previously both compiled regexes were re-created for
    // every sentence.
    // A token is kept only if it contains at least one character from these Arabic-script ranges.
    Regex validWordRegex = new Regex(@"[\u0600-\u06FF\u0750-\u076D]", RegexOptions.Compiled);
    // ASCII digits, the Arabic decimal and thousands separators, and the Arabic comma are stripped.
    Regex toRemove = new Regex(@"[0-9\u066B\u066C\u060C]", RegexOptions.Compiled);

    // Running position of the sentence within the document. It must live outside the loop:
    // declared inside, it was reset for each fragment and every sentence received order 1.
    int sentOrder = 0;

    ArrayList sentences = new ArrayList();
    for (int i = 0; i < splits.Length; i++)
    {
        string text = splits[i];
        if (text == null)
            continue;
        if (text.Trim().Equals(""))
            continue;

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove stop words.
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        // Lemmatizer
        text = Lemmatizer.getInstance(Conf.LEMMATIZATION_WORDS_PATH).replace(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits)
        {
            if (!validWordRegex.IsMatch(word))
                continue;

            string afterRemoval = toRemove.Replace(word, "");
            // Tokens shorter than two characters after cleaning are discarded.
            if (afterRemoval.Length < 2)
                continue;

            words.Add(afterRemoval);
            if (wordsCount[afterRemoval] == null)
                wordsCount[afterRemoval] = 1;
            else
                wordsCount[afterRemoval] = (int)wordsCount[afterRemoval] + 1;
        }

        sent.words = words;
        sent.wordsCount = wordsCount;

        // Is it a title? Decided by position (first fragment), never by comparing text, so a later
        // sentence that repeats the title's wording is still kept as a sentence.
        if (i == 0)
        {
            doc.title = sent;
        }
        else
        {
            sent.order = ++sentOrder;
            sentences.Add(sent);
        }
    }

    doc.sentences = sentences;
    return doc;
}
/// <summary>
/// Normalizes the raw document text, splits it into sentences, removes stop words, lemmatizes,
/// filters the remaining tokens, and builds a <c>Document</c> whose sentences carry their word
/// list, per-word occurrence counts and order of appearance.
/// </summary>
/// <param name="docText">Raw document text.</param>
/// <returns>
/// A document with <c>originalText</c>, <c>fullText</c> and <c>sentences</c> populated; <c>title</c>
/// is set only when the very first fragment is not blank.
/// </returns>
public virtual Document process(string docText) {
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Removal of extra characters and words is disabled in this implementation:
    // docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
    // docText = Regex.Replace(docText, @"\(يتبع\)", "");
    // docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

    // Normalize characters: alef-with-hamza forms to bare alef, alef maksura to yeh.
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // Hoisted out of the sentence loop: constructing compiled regexes per sentence is wasted work.
    // Keeps only tokens containing a character from these Arabic-script ranges.
    Regex validWordRegex = new Regex(@"[\u0600-\u06FF\u0750-\u076D]", RegexOptions.Compiled);
    // Strips ASCII digits, the Arabic decimal/thousands separators and the Arabic comma.
    Regex toRemove = new Regex(@"[0-9\u066B\u066C\u060C]", RegexOptions.Compiled);

    // Sentence counter for the whole document. When it was declared inside the loop it restarted
    // at 0 for each fragment, so every sentence ended up with order 1.
    int sentOrder = 0;

    ArrayList sentences = new ArrayList();
    for (int index = 0; index < splits.Length; index++) {
        string text = splits[index];
        if (text == null) {
            continue;
        }
        if (text.Trim().Equals("")) {
            continue;
        }

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove stop words.
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        // Lemmatizer
        text = Lemmatizer.getInstance(Conf.LEMMATIZATION_WORDS_PATH).replace(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits) {
            if (!validWordRegex.IsMatch(word)) {
                continue;
            }

            string afterRemoval = toRemove.Replace(word, "");
            // Anything shorter than two characters after cleaning is ignored.
            if (afterRemoval.Length < 2) {
                continue;
            }

            words.Add(afterRemoval);
            if (wordsCount[afterRemoval] == null) {
                wordsCount[afterRemoval] = 1;
            } else {
                wordsCount[afterRemoval] = (int)wordsCount[afterRemoval] + 1;
            }
        }

        sent.words = words;
        sent.wordsCount = wordsCount;

        // Title detection is positional (first fragment only), not a comparison of text, so a
        // later sentence repeating the title's wording remains a regular sentence.
        if (index == 0) {
            doc.title = sent;
        } else {
            sent.order = ++sentOrder;
            sentences.Add(sent);
        }
    }

    doc.sentences = sentences;
    return doc;
}