public void addDocument(Document doc)
{
    // Accumulates corpus statistics from one document:
    // wordsCount (total occurrences), wordRefsCount (document frequency),
    // wordTotal, sentCount and docCount.
    // NOTE(review): assumes wordsCount/wordRefsCount are Hashtables mapping
    // word -> boxed int — confirm against the declaring class.

    // Words already seen in this document; each word bumps the
    // document-frequency counter at most once per document.
    HashSet<string> seenInDoc = new HashSet<string>();

    foreach (Sentence sent in doc.sentences)
    {
        foreach (string word in sent.words)
        {
            // Corpus-wide occurrence count.
            object count = this.wordsCount[word];
            this.wordsCount[word] = (count == null) ? 1 : ((int)count) + 1;

            // Document frequency: HashSet.Add returns true only on first sight.
            if (seenInDoc.Add(word))
            {
                object refCount = this.wordRefsCount[word];
                this.wordRefsCount[word] = (refCount == null) ? 1 : ((int)refCount) + 1;
            }

            this.wordTotal++;
        }
        this.sentCount++;
    }
    this.docCount++;
}
public override Document process(string docText)
{
    // Parses raw agency text into a Document: strips wire-service
    // boilerplate, normalizes Arabic characters, splits into sentences,
    // removes stop words and builds per-sentence word counts.
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Join lines broken mid-sentence and strip agency markers
    // "(يتبع)" (continued) and everything from "(النهاية)" (the end) onward.
    docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
    docText = Regex.Replace(docText, @"\(يتبع\)", "");
    docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

    // Normalize Arabic character variants (hamza forms, alef maqsura).
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    // Split into sentences on full stops, "<br>" markers and line breaks.
    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    ArrayList sentences = new ArrayList();
    for (int i = 0; i < splits.Length; i++)
    {
        string text = splits[i];

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove Stop Words
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits)
        {
            words.Add(word);
            if (wordsCount[word] == null)
                wordsCount[word] = 1;
            else
                wordsCount[word] = (int)wordsCount[word] + 1;
        }
        sent.words = words;
        sent.wordsCount = wordsCount;

        // Title detection. BUG FIX: compare by position, not by value —
        // `split == splits[0]` is a string VALUE comparison, so a later
        // sentence whose text equals the first split was misclassified as
        // the title (overwriting doc.title and dropping the sentence).
        // The كونا (KUNA agency) check keeps agency bylines out of the title.
        if (i == 0 && !Regex.IsMatch(text, @"(.*)كونا(.*)"))
            doc.title = sent;
        else
            sentences.Add(sent);
    }
    doc.sentences = sentences;

    return doc;
}
public void addDocument(Document doc)
{
    // Accumulates unigram counts (wordsCount) and bigram counts
    // (wordsBigram: word -> Hashtable of follower -> count) from every
    // sentence of the document. Bigrams never cross sentence boundaries.
    foreach (Sentence sent in doc.sentences)
    {
        string previous = null;
        foreach (object item in sent.words)
        {
            string current = (string)item;

            // Unigram count.
            object uni = this.wordsCount[current];
            this.wordsCount[current] = (uni == null) ? 1 : ((int)uni) + 1;

            // Bigram count — skipped for the sentence-initial word.
            if (previous != null)
            {
                Hashtable followers = (Hashtable)this.wordsBigram[previous];
                if (followers == null)
                {
                    followers = new Hashtable();
                    this.wordsBigram[previous] = followers;
                }
                object bi = followers[current];
                followers[current] = (bi == null) ? 1 : ((int)bi) + 1;
            }

            previous = current;
        }
    }
}
public override string generateSummary(Document newDoc, double compressionRatio)
{
    // Centroid-based summarizer: scores each sentence by three features —
    // centroid value (C), position (P), overlap with title/first sentence (F) —
    // then iteratively penalizes redundancy until the summary stabilizes.
    double[] cTotal = new double[newDoc.sentences.Count];
    double[] pTotal = new double[newDoc.sentences.Count];
    double[] fTotal = new double[newDoc.sentences.Count];
    double cMax = double.MinValue;

    ArrayList centroids = buildCentroids(this.trainingDocs, IDF.getInstance());

    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        Sentence currSent = (Sentence)newDoc.sentences[i];

        // Calculate C: sum of centroid values of the sentence's words.
        cTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            cTotal[i] += getCentroidValue(centroids, word);
        }
        if (cTotal[i] > cMax)
            cMax = cTotal[i];

        // Calculate F: word-overlap with the title and the first sentence,
        // weighted by the word's count in the current sentence.
        fTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            int wordOccurence = 0;
            // BUG FIX: guard against a null title — the document processor
            // may leave doc.title unset (e.g. agency-byline first lines).
            if (newDoc.title != null && newDoc.title.wordsCount[word] != null)
            {
                wordOccurence += ((int)newDoc.title.wordsCount[word]);
            }
            if (newDoc.sentences.Count > 1)
            {
                Sentence firstSent = (Sentence)newDoc.sentences[0];
                if (firstSent.wordsCount[word] != null)
                {
                    wordOccurence += ((int)firstSent.wordsCount[word]);
                }
            }
            fTotal[i] += (wordOccurence * ((int)currSent.wordsCount[word]));
        }
    }

    // Calculate P: linearly decreasing positional score scaled by the best
    // centroid score. Arrays are zero based, hence (Count - i) without + 1.
    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        pTotal[i] = ((newDoc.sentences.Count - i) * cMax) / newDoc.sentences.Count;
    }

    // Combine the weighted features into a per-sentence weight.
    double maxScore = double.MinValue;
    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        double currWeight = (this.centroidWeight * cTotal[i])
            + (this.positionalWeight * pTotal[i])
            + (this.firstSentenceWeight * fTotal[i]);
        ((Sentence)newDoc.sentences[i]).weight = currWeight;
        if (currWeight > maxScore)
            maxScore = currWeight;
    }

    // Iteratively subtract a redundancy penalty from later sentences and
    // regenerate the summary until two consecutive summaries are identical.
    string genSummary = null;
    string prevgenSummary = null;
    do
    {
        for (int i = 0; i < newDoc.sentences.Count; i++)
        {
            // j starts at i + 1: penalty only flows from earlier to later sentences.
            for (int j = i + 1; j < newDoc.sentences.Count; j++)
            {
                double redundancy = redundancyPenalty(
                    (Sentence)newDoc.sentences[i], (Sentence)newDoc.sentences[j]);
                ((Sentence)newDoc.sentences[j]).weight -= (maxScore * redundancy);
            }
        }

        maxScore = double.MinValue;
        for (int i = 0; i < newDoc.sentences.Count; i++)
        {
            if (((Sentence)newDoc.sentences[i]).weight > maxScore)
                maxScore = ((Sentence)newDoc.sentences[i]).weight;
        }

        // typeof(Sentence) instead of new Sentence().GetType(): no throwaway allocation.
        Sentence[] sents = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));
        prevgenSummary = genSummary;
        // BUG FIX: honor the compressionRatio argument instead of silently
        // reading this.compressionRatio (matches the sibling algorithm's
        // generateSummary, which uses the parameter).
        genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
    } while (!genSummary.Equals(prevgenSummary));

    return (genSummary);
}
public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
{
    // Scores one sentence as the sum of four features: lead position,
    // title overlap, cue words and a tf-idf-style average.
    // Stores the result in sent.weight and returns it.
    Trace.write(sent.fullText);
    double weight = 0;

    // 1: ScLead — the lead sentence gets double credit.
    double sclead = (sent == doc.sentences[0]) ? 2 : 1;
    Trace.write("SCLead : " + sclead);

    // 2: ScTitle — term-frequency-weighted overlap with the title.
    double sctitle = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (doc.title != null)
        {
            // PERF FIX: ArrayList.Contains uses the same value equality as
            // the old words.ToArray().Contains(...) but without allocating
            // a fresh object[] for every word.
            if (doc.title.words.Contains(aWord))
                sctitle += (2 * tf);
        }
    }
    Trace.write("SCTitle : " + sctitle);

    // 3: ScCue — accumulated term frequency of cue words.
    double sccue = 0;
    foreach (string aWord in sent.words)
    {
        if (CueWords.getInstance(Conf.CUE_WORDS_PATH).contains(aWord))
        {
            double tf = termFrequency(sent, aWord);
            sccue += tf;
        }
    }
    Trace.write("SCCue : " + sccue);

    // 4: ScTFIDF — ((tf - 1) / tf) * idf, averaged over the sentence.
    double sctfidf = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (tf != 0)
            sctfidf += (((tf - 1) / tf) * idf.get(aWord));
    }
    // BUG FIX: guard the division — an empty word list previously produced
    // 0.0 / 0 = NaN, which poisoned the sentence weight and every later
    // comparison against it.
    if (sent.words.Count > 0)
        sctfidf = sctfidf / sent.words.Count;
    Trace.write("SCTFIDF : " + sctfidf);

    weight = sclead + sctitle + sccue + sctfidf;
    sent.weight = weight;
    Trace.write("Weight : " + weight);

    return (weight);
}
public override string generateSummary(Document newDoc, double compressionRatio)
{
    // Weight every sentence, then delegate sentence selection to the
    // summary utility using the requested compression ratio.
    foreach (Sentence sentence in newDoc.sentences)
    {
        calcSentenceWeight(IDF.getInstance(), newDoc, sentence);
    }

    Sentence[] ranked = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));
    string genSummary = SummaryUtil.SummarizeByCompressionRatio(ranked, compressionRatio);

    return (genSummary);
}
public virtual Document process(string docText)
{
    // Base document processor: normalizes Arabic text, splits it into
    // sentences, removes stop words, lemmatizes, filters non-Arabic tokens
    // and builds per-sentence word counts. The first non-empty split
    // becomes the title; the rest become ordered sentences.
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Normalize Arabic character variants (hamza forms, alef maqsura).
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    // Split into sentences on full stops, "<br>" markers and line breaks.
    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // PERF FIX: hoisted out of the loop — these patterns are loop-invariant,
    // and constructing a RegexOptions.Compiled regex per iteration is costly.
    Regex validWordRegex = new Regex(@"[\u0600-\u06FF\u0750-\u076D]", RegexOptions.Compiled);
    Regex toRemove = new Regex(@"[0-9\u066B\u066C\u060C]", RegexOptions.Compiled);

    ArrayList sentences = new ArrayList();
    // BUG FIX: the running sentence order must live OUTSIDE the loop.
    // It was previously redeclared (reset to 0) inside each iteration,
    // so every sentence ended up with order == 1.
    int sentOrder = 0;
    for (int i = 0; i < splits.Length; i++)
    {
        string text = splits[i];
        if (text == null)
            continue;
        if (text.Trim().Equals(""))
            continue;

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove Stop Words
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        // Lemmatizer
        text = Lemmatizer.getInstance(Conf.LEMMATIZATION_WORDS_PATH).replace(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits)
        {
            // Keep only tokens containing at least one Arabic letter.
            if (!validWordRegex.IsMatch(word))
                continue;
            // Strip digits and Arabic decimal/thousands separators and comma.
            string afterRemoval = toRemove.Replace(word, "");
            // Drop tokens shorter than two characters after stripping.
            if (afterRemoval.Length < 2)
                continue;
            words.Add(afterRemoval);
            if (wordsCount[afterRemoval] == null)
                wordsCount[afterRemoval] = 1;
            else
                wordsCount[afterRemoval] = (int)wordsCount[afterRemoval] + 1;
        }
        sent.words = words;
        sent.wordsCount = wordsCount;

        // Title detection by position, not value — equivalent to the old
        // reference comparison against splits[0], but cannot be fooled by
        // interned duplicate strings.
        if (i == 0)
            doc.title = sent;
        else
        {
            sent.order = ++sentOrder;
            sentences.Add(sent);
        }
    }
    doc.sentences = sentences;

    return doc;
}