Sentence, TextSummarizationAlgos.DocumentProcessing C# (CSharp)代码示例

示例#1

0

显示文件

文件： SummaryUtil.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Produces a summary whose sentence count is a fraction of the number of input sentences.
        /// </summary>
        /// <param name="sents">Candidate sentences; passed on to <c>SummarizeBySentenceCount</c>.</param>
        /// <param name="ratio">Fraction of sentences to keep; must be greater than 0 and at most 1.</param>
        /// <returns>Whatever <c>SummarizeBySentenceCount</c> returns for the computed sentence count.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="ratio"/> is not in the range (0, 1]; NaN is rejected as well.
        /// </exception>
        public static string SummarizeByCompressionRatio(Sentence[] sents, double ratio)
        {
            bool ratioInRange = ratio > 0 && ratio <= 1;

            if (!ratioInRange)
            {
                throw new ArgumentOutOfRangeException("ratio");
            }

            // The cast truncates toward zero, so a small ratio on a short input yields 0 sentences.
            int sentencesToKeep = (int)(sents.Length * ratio);

            return SummarizeBySentenceCount(sents, sentencesToKeep);
        }

示例#2

0

显示文件

文件： SummaryUtil.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Collects the distinct words that occur in both sentences.
        /// </summary>
        /// <param name="firstSentence">Sentence whose words are tested for membership.</param>
        /// <param name="secondSentence">Sentence whose words form the lookup set.</param>
        /// <returns>A set with every word of <paramref name="firstSentence"/> that also appears in <paramref name="secondSentence"/>.</returns>
        public static HashSet<string> getCommonWords(Sentence firstSentence, Sentence secondSentence)
        {
            // Index the second sentence once so each membership test below is a hash lookup
            // instead of a linear scan over the word list.
            HashSet<string> secondWords = new HashSet<string>();

            foreach (string aWord in secondSentence.words)
            {
                secondWords.Add(aWord);
            }

            HashSet<string> commonWords = new HashSet<string>();

            foreach (string aWord in firstSentence.words)
            {
                if (secondWords.Contains(aWord))
                {
                    commonWords.Add(aWord);
                }
            }

            return commonWords;
        }

示例#3

0

显示文件

文件： SummaryUtil.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Concatenates the full text of the first <paramref name="count"/> sentences, one per line.
        /// </summary>
        /// <param name="sents">Sentences in the order they should appear in the output.</param>
        /// <param name="count">Maximum number of sentences to emit; clamped to the array length.</param>
        /// <returns>
        /// Each selected sentence's <c>fullText</c> followed by <see cref="Environment.NewLine"/>;
        /// an empty string when nothing is selected.
        /// </returns>
        public static string getText(Sentence[] sents, int count)
        {
            int numSents = Math.Min(count, sents.Length);

            // A builder avoids re-copying the accumulated text on every appended sentence.
            System.Text.StringBuilder genSummary = new System.Text.StringBuilder();

            for (int i = 0; i < numSents; i++)
            {
                genSummary.Append(sents[i].fullText).Append(Environment.NewLine);
            }

            return genSummary.ToString();
        }

示例#4

0

显示文件

文件： SummaryUtil.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Orders the sentences with <c>SentenceComparer</c>, reverses that order, traces every
        /// sentence with its weight, and returns the text of the first <paramref name="count"/> sentences.
        /// </summary>
        /// <param name="sents">Sentences to rank. NOTE: this array is reordered in place.</param>
        /// <param name="count">Number of leading sentences to include in the returned text.</param>
        /// <returns>The text produced by <c>getText</c> for the reordered array.</returns>
        public static string SummarizeBySentenceCount(Sentence[] sents, int count)
        {
            // Sort by the comparer, then flip the result
            // (presumably giving highest weight first - verify against SentenceComparer).
            SentenceComparer comparer = new SentenceComparer();
            Array.Sort(sents, comparer);
            Array.Reverse(sents);

            for (int i = 0; i < sents.Length; i++)
            {
                Sentence ranked = sents[i];

                Trace.write(ranked.fullText);
                Trace.write("Weight : " + ranked.weight);
            }

            string summary = getText(sents, count);

            return summary;
        }

示例#5

0

显示文件

文件： LexRankCommon.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Computes the idf-modified cosine similarity between two sentences.
        /// </summary>
        /// <param name="idf">Source of inverse-document-frequency values, queried through <c>idf.get(word)</c>.</param>
        /// <param name="firstSentence">First sentence.</param>
        /// <param name="secondSentence">Second sentence.</param>
        /// <returns>
        /// The sum over shared words of tf1 * tf2 * idf^2, divided by the product of the two sentence norms;
        /// 0 when either norm is zero (for example, a sentence without words).
        /// </returns>
        public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
        {
            // Index the second sentence once so finding shared words does not rescan its word list per word.
            HashSet<string> secondWords = new HashSet<string>();

            foreach (string aWord in secondSentence.words)
            {
                secondWords.Add(aWord);
            }

            HashSet<string> commonWords = new HashSet<string>();

            foreach (string aWord in firstSentence.words)
            {
                if (secondWords.Contains(aWord))
                {
                    commonWords.Add(aWord);
                }
            }

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += (termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf.get(aWord), 2));
            }

            // NOTE(review): the norms below iterate every word occurrence, so a repeated word
            // contributes once per occurrence, while the numerator uses distinct words only.
            // Kept as in the original; confirm whether that asymmetry is intended.
            double denominator1 = 0;

            foreach (string aWord in firstSentence.words)
            {
                denominator1 += Math.Pow(termFrequency(firstSentence, aWord) * idf.get(aWord), 2);
            }

            denominator1 = Math.Sqrt(denominator1);

            double denominator2 = 0;

            foreach (string aWord in secondSentence.words)
            {
                denominator2 += Math.Pow(termFrequency(secondSentence, aWord) * idf.get(aWord), 2);
            }

            denominator2 = Math.Sqrt(denominator2);

            double denominator = denominator1 * denominator2;

            // An exact zero means at least one sentence has no weighted words; dividing would
            // yield NaN (0/0), which would then leak into everything built on this similarity.
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }

示例#6

0

显示文件

文件： KunaDocumentProcessor.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Builds a <see cref="Document"/> from raw article text: strips continuation/end markers,
        /// normalizes characters, splits the text into sentences and records each sentence's words
        /// and per-word occurrence counts.
        /// </summary>
        /// <param name="docText">Raw article text.</param>
        /// <returns>
        /// A document with <c>originalText</c>, <c>fullText</c> and <c>sentences</c> set, and <c>title</c>
        /// set when the first segment qualifies as a title.
        /// </returns>
        public override Document process(string docText)
        {
            Document doc = new Document();

            doc.originalText = docText;

            // Begin : Preprocessing

            // Drop a line break that is directly followed by text (joins wrapped lines).
            docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
            // Remove the literal "(يتبع)" marker.
            docText = Regex.Replace(docText, @"\(يتبع\)", "");
            // Remove the "(النهاية)" marker together with everything after it.
            docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

            // Normalize Characters
            docText = Regex.Replace(docText, "أ|إ", "ا");
            docText = Regex.Replace(docText, "ى", "ي");

            doc.fullText = docText;
            // End : Preprocessing

            string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            ArrayList sentences = new ArrayList();

            for (int i = 0; i < splits.Length; i++)
            {
                string text = splits[i];
                Sentence sent = new Sentence();

                // fullText keeps the segment exactly as split; only the working copy is trimmed.
                sent.fullText = text;

                text = Regex.Replace(text, @"^\s+", "");
                text = Regex.Replace(text, @"\s+$", "");

                // Remove Stop Words
                text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

                string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

                ArrayList words = new ArrayList();
                Hashtable wordsCount = new Hashtable();

                foreach (string word in wordSplits)
                {
                    words.Add(word);

                    if (wordsCount[word] == null)
                        wordsCount[word] = 1;
                    else
                        wordsCount[word] = (int)wordsCount[word] + 1;
                }

                sent.words = words;
                sent.wordsCount = wordsCount;

                // Only the first segment can be the title, and only when it does not contain "كونا".
                // The position is checked by index: comparing string values would also match any
                // later segment whose text equals the first one (e.g. repeated empty segments).
                if (i == 0 && !Regex.IsMatch(text, @"(.*)كونا(.*)"))
                    doc.title = sent;
                else
                    sentences.Add(sent);
            }

            doc.sentences = sentences;

            return doc;
        }

示例#7

0

显示文件

文件： LexRankCommon.cs 项目： shereefsakr/arabictext-summarizer

        /*
        public static double[][] generateIdfModifiedCosineMatrix(DocsStatistics docStats, ArrayList sentences)
        {
            double[][] idfModifiedCosine = new double[sentences.Count][];

            for (int i = 0; i < sentences.Count; i++)
            {
                idfModifiedCosine[i] = new double[sentences.Count];
            }

            for (int i = 0; i < sentences.Count; i++)
            {
                Sentence firstSent = (Sentence)sentences[i];

                for (int j = 0; j < sentences.Count; j++)
                {
                    // same sentence then 1
                    //*
                    if (i == j)
                    {
                        idfModifiedCosine[i][j] = 1;
                        continue;
                    }
                    //* /

                    // has been processed before
                    if (idfModifiedCosine[i][j] != 0)
                        continue;

                    Sentence secondSent = (Sentence)sentences[j];

                    idfModifiedCosine[i][j] = idfModifiedCos(docStats, firstSent, secondSent);
                    idfModifiedCosine[j][i] = idfModifiedCosine[i][j];
                }
            }

            return (idfModifiedCosine);
        }

        public static double idfModifiedCos(DocsStatistics docStats, Sentence firstSentence, Sentence secondSentence)
        {
            double idfModifiedCosine = 0;

            HashSet<string> commonWords = new HashSet<string>();

            foreach (string aWord in firstSentence.words)
            {
                if (secondSentence.words.Contains(aWord))
                    commonWords.Add(aWord);
            }

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += (termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf(docStats, aWord), 2));
            }

            double denominator1 = 0;

            foreach (string aWord in firstSentence.words)
            {
                //if (docStats.wordRefs[aWord] != null)
                denominator1 += Math.Pow(termFrequency(firstSentence, aWord) * idf(docStats, aWord), 2);
            }

            denominator1 = Math.Sqrt(denominator1);

            double denominator2 = 0;

            foreach (string aWord in secondSentence.words)
            {
                //if (docStats.wordRefs[aWord] != null)
                denominator2 += Math.Pow(termFrequency(secondSentence, aWord) * idf(docStats, aWord), 2);
            }

            denominator2 = Math.Sqrt(denominator2);

            idfModifiedCosine = numerator / (denominator1 * denominator2);

            return (idfModifiedCosine);
        }
        //*/
        /// <summary>
        /// Returns how many times <paramref name="word"/> occurs in <paramref name="sent"/>,
        /// as recorded in the sentence's <c>wordsCount</c> table.
        /// </summary>
        /// <param name="sent">Sentence whose word counts are consulted.</param>
        /// <param name="word">Word to look up.</param>
        /// <returns>The raw occurrence count, or 0 when the word has no entry.</returns>
        public static double termFrequency(Sentence sent, string word)
        {
            object occurrences = sent.wordsCount[word];

            if (occurrences == null)
            {
                return 0;
            }

            // Counts are stored as boxed ints; the result widens to double on return.
            return (int)occurrences;
        }

示例#8

0

显示文件

文件： CentroidAlgorithm2.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Measures word overlap between two sentences as
        /// 2 * |common words| / (|words of first| + |words of second|).
        /// </summary>
        /// <param name="firstSentence">First sentence.</param>
        /// <param name="secondSentence">Second sentence.</param>
        /// <returns>The overlap ratio; 0 when neither sentence has any words.</returns>
        private static double redundancyPenalty(Sentence firstSentence, Sentence secondSentence)
        {
            int totalWords = firstSentence.words.Count + secondSentence.words.Count;

            // Two empty sentences share nothing; without this guard the division is 0/0 = NaN.
            if (totalWords == 0)
            {
                return 0;
            }

            HashSet<string> commonWords = SummaryUtil.getCommonWords(firstSentence, secondSentence);

            return (double)(2 * commonWords.Count) / (double)totalWords;
        }

示例#9

0

显示文件

文件： KunaDocumentProcessor.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Builds a <see cref="Document"/> from raw article text: strips continuation/end markers,
        /// normalizes characters, splits the text into sentences and records each sentence's words
        /// and per-word occurrence counts.
        /// </summary>
        /// <param name="docText">Raw article text.</param>
        /// <returns>
        /// A document with <c>originalText</c>, <c>fullText</c> and <c>sentences</c> set, and <c>title</c>
        /// set when the first segment qualifies as a title.
        /// </returns>
        public override Document process(string docText)
        {
            Document doc = new Document();

            doc.originalText = docText;

            // Begin : Preprocessing

            // Drop a line break that is directly followed by text (joins wrapped lines).
            docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
            // Remove the literal "(يتبع)" marker.
            docText = Regex.Replace(docText, @"\(يتبع\)", "");
            // Remove the "(النهاية)" marker together with everything after it.
            docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

            // Normalize Characters
            docText = Regex.Replace(docText, "أ|إ", "ا");
            docText = Regex.Replace(docText, "ى", "ي");

            doc.fullText = docText;
            // End : Preprocessing

            string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            ArrayList sentences = new ArrayList();

            for (int i = 0; i < splits.Length; i++)
            {
                string   text = splits[i];
                Sentence sent = new Sentence();

                // fullText keeps the segment exactly as split; only the working copy is trimmed.
                sent.fullText = text;

                text = Regex.Replace(text, @"^\s+", "");
                text = Regex.Replace(text, @"\s+$", "");

                // Remove Stop Words
                text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

                string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

                ArrayList words      = new ArrayList();
                Hashtable wordsCount = new Hashtable();

                foreach (string word in wordSplits)
                {
                    words.Add(word);

                    if (wordsCount[word] == null)
                    {
                        wordsCount[word] = 1;
                    }
                    else
                    {
                        wordsCount[word] = (int)wordsCount[word] + 1;
                    }
                }

                sent.words      = words;
                sent.wordsCount = wordsCount;

                // Only the first segment can be the title, and only when it does not contain "كونا".
                // The position is checked by index: comparing string values would also match any
                // later segment whose text equals the first one (e.g. repeated empty segments).
                if (i == 0 && !Regex.IsMatch(text, @"(.*)كونا(.*)"))
                {
                    doc.title = sent;
                }
                else
                {
                    sentences.Add(sent);
                }
            }

            doc.sentences = sentences;

            return doc;
        }

示例#10

0

显示文件

文件： LakhasAlgorithm.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Scores a sentence as the sum of four components (lead position, title overlap,
        /// cue words and a tf-idf term), stores the score in <c>sent.weight</c>, traces every
        /// component and returns the score.
        /// </summary>
        /// <param name="idf">Source of inverse-document-frequency values, queried through <c>idf.get(word)</c>.</param>
        /// <param name="doc">Document the sentence belongs to; supplies the first sentence and the optional title.</param>
        /// <param name="sent">Sentence to score; its <c>weight</c> is overwritten.</param>
        /// <returns>The computed weight.</returns>
        public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
        {
            Trace.write(sent.fullText);

            // 1: ScLead - the document's first sentence scores 2, every other sentence 1.
            // The Count check keeps an empty sentence list from throwing on index 0.
            double sclead = (doc.sentences.Count > 0 && sent == doc.sentences[0]) ? 2 : 1;

            Trace.write("SCLead : " + sclead);

            // 2: ScTitle - each word occurrence that also appears in the title adds 2 * tf.
            double sctitle = 0;

            // Materialize the title words once rather than once per sentence word.
            var titleWords = doc.title != null ? doc.title.words.ToArray() : null;

            if (titleWords != null)
            {
                foreach (string aWord in sent.words)
                {
                    if (titleWords.Contains(aWord))
                        sctitle += (2 * termFrequency(sent, aWord));
                }
            }

            Trace.write("SCTitle : " + sctitle);

            // 3: sccue - each occurrence of a cue word adds its tf.
            double sccue = 0;

            foreach (string aWord in sent.words)
            {
                if (CueWords.getInstance(Conf.CUE_WORDS_PATH).contains(aWord))
                {
                    sccue += termFrequency(sent, aWord);
                }
            }

            Trace.write("SCCue : " + sccue);

            // 4: sctfidf - sum of ((tf - 1) / tf) * idf over the word occurrences, averaged per word.
            double sctfidf = 0;

            foreach (string aWord in sent.words)
            {
                double tf = termFrequency(sent, aWord);

                if (tf != 0)
                    sctfidf += (((tf - 1) / tf) * idf.get(aWord));
            }

            // A sentence with no words would otherwise divide 0 by 0 and poison the weight with NaN.
            if (sent.words.Count > 0)
                sctfidf = sctfidf / sent.words.Count;

            Trace.write("SCTFIDF : " + sctfidf);

            double weight = sclead + sctitle + sccue + sctfidf;

            sent.weight = weight;

            Trace.write("Weight : " + weight);

            return weight;
        }

示例#11

0

显示文件

文件： DocumentProcessor.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Builds a <see cref="Document"/> from raw text: normalizes characters, splits the text into
        /// sentences, removes stop words, lemmatizes, filters the words and records each sentence's
        /// words and per-word occurrence counts.
        /// </summary>
        /// <param name="docText">Raw document text.</param>
        /// <returns>
        /// A document with <c>originalText</c>, <c>fullText</c> and <c>sentences</c> set. The first
        /// segment becomes <c>title</c> unless it is blank; the remaining non-blank segments become
        /// sentences numbered 1, 2, 3, ... through <c>order</c>.
        /// </returns>
        public virtual Document process(string docText)
        {
            Document doc = new Document();

            doc.originalText = docText;

            // Begin : Preprocessing

            // Normalize Characters
            docText = Regex.Replace(docText, "أ|إ", "ا");
            docText = Regex.Replace(docText, "ى", "ي");

            doc.fullText = docText;
            // End : Preprocessing

            string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            ArrayList sentences = new ArrayList();

            // Built once per call instead of once per sentence; compiled regex construction is costly.
            // A word is kept only if it contains a character from these Unicode ranges.
            Regex validWordRegex = new Regex(@"[\u0600-\u06FF\u0750-\u076D]", RegexOptions.Compiled);
            // ASCII digits and the characters U+066B, U+066C and U+060C are stripped from kept words.
            Regex toRemove = new Regex(@"[0-9\u066B\u066C\u060C]", RegexOptions.Compiled);

            // Must live outside the loop: declared per iteration it restarted at 0,
            // so every sentence ended up with order 1.
            int sentOrder = 0;

            for (int i = 0; i < splits.Length; i++)
            {
                string text = splits[i];

                if (text == null)
                    continue;

                if (text.Trim().Equals(""))
                    continue;

                Sentence sent = new Sentence();

                // fullText keeps the segment exactly as split; only the working copy is processed.
                sent.fullText = text;

                text = Regex.Replace(text, @"^\s+", "");
                text = Regex.Replace(text, @"\s+$", "");

                // Remove Stop Words
                text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

                // Lemmatizer
                text = Lemmatizer.getInstance(Conf.LEMMATIZATION_WORDS_PATH).replace(text);

                string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

                ArrayList words = new ArrayList();
                Hashtable wordsCount = new Hashtable();

                foreach (string word in wordSplits)
                {
                    if (!validWordRegex.IsMatch(word))
                        continue;

                    string afterRemoval = toRemove.Replace(word, "");

                    // Words shorter than two characters after stripping are dropped.
                    if (afterRemoval.Length < 2)
                        continue;

                    words.Add(afterRemoval);

                    if (wordsCount[afterRemoval] == null)
                        wordsCount[afterRemoval] = 1;
                    else
                        wordsCount[afterRemoval] = (int)wordsCount[afterRemoval] + 1;
                }

                sent.words = words;
                sent.wordsCount = wordsCount;

                // The first segment is the title. Checking the index states that directly
                // instead of relying on string reference identity.
                if (i == 0)
                    doc.title = sent;
                else
                {
                    sent.order = ++sentOrder;
                    sentences.Add(sent);
                }
            }

            doc.sentences = sentences;

            return doc;
        }

示例#12

0

显示文件

文件： DocumentProcessor.cs 项目： shereefsakr/arabictext-summarizer

        /// <summary>
        /// Builds a <see cref="Document"/> from raw text: normalizes characters, splits the text into
        /// sentences, removes stop words, lemmatizes, filters the words and records each sentence's
        /// words and per-word occurrence counts.
        /// </summary>
        /// <param name="docText">Raw document text.</param>
        /// <returns>
        /// A document with <c>originalText</c>, <c>fullText</c> and <c>sentences</c> set. The first
        /// segment becomes <c>title</c> unless it is blank; the remaining non-blank segments become
        /// sentences numbered 1, 2, 3, ... through <c>order</c>.
        /// </returns>
        public virtual Document process(string docText)
        {
            Document doc = new Document();

            doc.originalText = docText;

            // Begin : Preprocessing

            // Normalize Characters
            docText = Regex.Replace(docText, "أ|إ", "ا");
            docText = Regex.Replace(docText, "ى", "ي");

            doc.fullText = docText;
            // End : Preprocessing

            string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            ArrayList sentences = new ArrayList();

            // Built once per call instead of once per sentence; compiled regex construction is costly.
            // A word is kept only if it contains a character from these Unicode ranges.
            Regex validWordRegex = new Regex(@"[\u0600-\u06FF\u0750-\u076D]", RegexOptions.Compiled);
            // ASCII digits and the characters U+066B, U+066C and U+060C are stripped from kept words.
            Regex toRemove       = new Regex(@"[0-9\u066B\u066C\u060C]", RegexOptions.Compiled);

            // Must live outside the loop: declared per iteration it restarted at 0,
            // so every sentence ended up with order 1.
            int sentOrder = 0;

            for (int i = 0; i < splits.Length; i++)
            {
                string text = splits[i];

                if (text == null)
                {
                    continue;
                }

                if (text.Trim().Equals(""))
                {
                    continue;
                }

                Sentence sent = new Sentence();

                // fullText keeps the segment exactly as split; only the working copy is processed.
                sent.fullText = text;

                text = Regex.Replace(text, @"^\s+", "");
                text = Regex.Replace(text, @"\s+$", "");

                // Remove Stop Words
                text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

                // Lemmatizer
                text = Lemmatizer.getInstance(Conf.LEMMATIZATION_WORDS_PATH).replace(text);

                string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

                ArrayList words      = new ArrayList();
                Hashtable wordsCount = new Hashtable();

                foreach (string word in wordSplits)
                {
                    if (!validWordRegex.IsMatch(word))
                    {
                        continue;
                    }

                    string afterRemoval = toRemove.Replace(word, "");

                    // Words shorter than two characters after stripping are dropped.
                    if (afterRemoval.Length < 2)
                    {
                        continue;
                    }

                    words.Add(afterRemoval);

                    if (wordsCount[afterRemoval] == null)
                    {
                        wordsCount[afterRemoval] = 1;
                    }
                    else
                    {
                        wordsCount[afterRemoval] = (int)wordsCount[afterRemoval] + 1;
                    }
                }

                sent.words      = words;
                sent.wordsCount = wordsCount;

                // The first segment is the title. Checking the index states that directly
                // instead of relying on string reference identity.
                if (i == 0)
                {
                    doc.title = sent;
                }
                else
                {
                    sent.order = ++sentOrder;
                    sentences.Add(sent);
                }
            }

            doc.sentences = sentences;

            return doc;
        }

C# (CSharp) TextSummarizationAlgos.DocumentProcessing Sentence示例