public void addDocument(Document doc)
{
    // Accumulates corpus statistics from one document:
    // wordsCount (total occurrences), wordRefsCount (document frequency),
    // wordTotal, sentCount and docCount.
    // NOTE(review): assumes wordsCount/wordRefsCount are Hashtables mapping
    // word -> boxed int — confirm against the declaring class.

    // Words already seen in this document; each word bumps the
    // document-frequency counter at most once per document.
    HashSet<string> seenInDoc = new HashSet<string>();

    foreach (Sentence sent in doc.sentences)
    {
        foreach (string word in sent.words)
        {
            // Corpus-wide occurrence count.
            object count = this.wordsCount[word];
            this.wordsCount[word] = (count == null) ? 1 : ((int)count) + 1;

            // Document frequency: HashSet.Add returns true only on first sight.
            if (seenInDoc.Add(word))
            {
                object refCount = this.wordRefsCount[word];
                this.wordRefsCount[word] = (refCount == null) ? 1 : ((int)refCount) + 1;
            }

            this.wordTotal++;
        }
        this.sentCount++;
    }
    this.docCount++;
}
public override Document process(string docText)
{
    // Parses raw agency text into a Document: strips wire-service
    // boilerplate, normalizes Arabic characters, splits into sentences,
    // removes stop words and builds per-sentence word counts.
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Join lines broken mid-sentence and strip agency markers
    // "(يتبع)" (continued) and everything from "(النهاية)" (the end) onward.
    docText = Regex.Replace(docText, "\r\n([^\r\n])", "$1", RegexOptions.Multiline);
    docText = Regex.Replace(docText, @"\(يتبع\)", "");
    docText = Regex.Replace(docText, @"\(النهاية\)(.*)", "", RegexOptions.Multiline | RegexOptions.Singleline);

    // Normalize Arabic character variants (hamza forms, alef maqsura).
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    // Split into sentences on full stops, "<br>" markers and line breaks.
    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    ArrayList sentences = new ArrayList();
    for (int i = 0; i < splits.Length; i++)
    {
        string text = splits[i];

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove Stop Words
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits)
        {
            words.Add(word);
            if (wordsCount[word] == null)
                wordsCount[word] = 1;
            else
                wordsCount[word] = (int)wordsCount[word] + 1;
        }
        sent.words = words;
        sent.wordsCount = wordsCount;

        // Title detection. BUG FIX: compare by position, not by value —
        // `split == splits[0]` is a string VALUE comparison, so a later
        // sentence whose text equals the first split was misclassified as
        // the title (overwriting doc.title and dropping the sentence).
        // The كونا (KUNA agency) check keeps agency bylines out of the title.
        if (i == 0 && !Regex.IsMatch(text, @"(.*)كونا(.*)"))
            doc.title = sent;
        else
            sentences.Add(sent);
    }
    doc.sentences = sentences;

    return doc;
}
public void addDocument(Document doc)
{
    // Accumulates unigram counts (wordsCount) and bigram counts
    // (wordsBigram: word -> Hashtable of follower -> count) from every
    // sentence of the document. Bigrams never cross sentence boundaries.
    foreach (Sentence sent in doc.sentences)
    {
        string previous = null;
        foreach (object item in sent.words)
        {
            string current = (string)item;

            // Unigram count.
            object uni = this.wordsCount[current];
            this.wordsCount[current] = (uni == null) ? 1 : ((int)uni) + 1;

            // Bigram count — skipped for the sentence-initial word.
            if (previous != null)
            {
                Hashtable followers = (Hashtable)this.wordsBigram[previous];
                if (followers == null)
                {
                    followers = new Hashtable();
                    this.wordsBigram[previous] = followers;
                }
                object bi = followers[current];
                followers[current] = (bi == null) ? 1 : ((int)bi) + 1;
            }

            previous = current;
        }
    }
}
public override string generateSummary(Document newDoc, double compressionRatio)
{
    // Centroid-based summarizer: scores each sentence by three features —
    // centroid value (C), position (P), overlap with title/first sentence (F) —
    // then iteratively penalizes redundancy until the summary stabilizes.
    double[] cTotal = new double[newDoc.sentences.Count];
    double[] pTotal = new double[newDoc.sentences.Count];
    double[] fTotal = new double[newDoc.sentences.Count];
    double cMax = double.MinValue;

    ArrayList centroids = buildCentroids(this.trainingDocs, IDF.getInstance());

    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        Sentence currSent = (Sentence)newDoc.sentences[i];

        // Calculate C: sum of centroid values of the sentence's words.
        cTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            cTotal[i] += getCentroidValue(centroids, word);
        }
        if (cTotal[i] > cMax)
            cMax = cTotal[i];

        // Calculate F: word-overlap with the title and the first sentence,
        // weighted by the word's count in the current sentence.
        fTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            int wordOccurence = 0;
            // BUG FIX: guard against a null title — the document processor
            // may leave doc.title unset (e.g. agency-byline first lines).
            if (newDoc.title != null && newDoc.title.wordsCount[word] != null)
            {
                wordOccurence += ((int)newDoc.title.wordsCount[word]);
            }
            if (newDoc.sentences.Count > 1)
            {
                Sentence firstSent = (Sentence)newDoc.sentences[0];
                if (firstSent.wordsCount[word] != null)
                {
                    wordOccurence += ((int)firstSent.wordsCount[word]);
                }
            }
            fTotal[i] += (wordOccurence * ((int)currSent.wordsCount[word]));
        }
    }

    // Calculate P: linearly decreasing positional score scaled by the best
    // centroid score. Arrays are zero based, hence (Count - i) without + 1.
    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        pTotal[i] = ((newDoc.sentences.Count - i) * cMax) / newDoc.sentences.Count;
    }

    // Combine the weighted features into a per-sentence weight.
    double maxScore = double.MinValue;
    for (int i = 0; i < newDoc.sentences.Count; i++)
    {
        double currWeight = (this.centroidWeight * cTotal[i])
            + (this.positionalWeight * pTotal[i])
            + (this.firstSentenceWeight * fTotal[i]);
        ((Sentence)newDoc.sentences[i]).weight = currWeight;
        if (currWeight > maxScore)
            maxScore = currWeight;
    }

    // Iteratively subtract a redundancy penalty from later sentences and
    // regenerate the summary until two consecutive summaries are identical.
    string genSummary = null;
    string prevgenSummary = null;
    do
    {
        for (int i = 0; i < newDoc.sentences.Count; i++)
        {
            // j starts at i + 1: penalty only flows from earlier to later sentences.
            for (int j = i + 1; j < newDoc.sentences.Count; j++)
            {
                double redundancy = redundancyPenalty(
                    (Sentence)newDoc.sentences[i], (Sentence)newDoc.sentences[j]);
                ((Sentence)newDoc.sentences[j]).weight -= (maxScore * redundancy);
            }
        }

        maxScore = double.MinValue;
        for (int i = 0; i < newDoc.sentences.Count; i++)
        {
            if (((Sentence)newDoc.sentences[i]).weight > maxScore)
                maxScore = ((Sentence)newDoc.sentences[i]).weight;
        }

        // typeof(Sentence) instead of new Sentence().GetType(): no throwaway allocation.
        Sentence[] sents = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));
        prevgenSummary = genSummary;
        // BUG FIX: honor the compressionRatio argument instead of silently
        // reading this.compressionRatio (matches the sibling algorithm's
        // generateSummary, which uses the parameter).
        genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
    } while (!genSummary.Equals(prevgenSummary));

    return (genSummary);
}
public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
{
    // Scores one sentence as the sum of four features: lead position,
    // title overlap, cue words and a tf-idf-style average.
    // Stores the result in sent.weight and returns it.
    Trace.write(sent.fullText);
    double weight = 0;

    // 1: ScLead — the lead sentence gets double credit.
    double sclead = (sent == doc.sentences[0]) ? 2 : 1;
    Trace.write("SCLead : " + sclead);

    // 2: ScTitle — term-frequency-weighted overlap with the title.
    double sctitle = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (doc.title != null)
        {
            // PERF FIX: ArrayList.Contains uses the same value equality as
            // the old words.ToArray().Contains(...) but without allocating
            // a fresh object[] for every word.
            if (doc.title.words.Contains(aWord))
                sctitle += (2 * tf);
        }
    }
    Trace.write("SCTitle : " + sctitle);

    // 3: ScCue — accumulated term frequency of cue words.
    double sccue = 0;
    foreach (string aWord in sent.words)
    {
        if (CueWords.getInstance(Conf.CUE_WORDS_PATH).contains(aWord))
        {
            double tf = termFrequency(sent, aWord);
            sccue += tf;
        }
    }
    Trace.write("SCCue : " + sccue);

    // 4: ScTFIDF — ((tf - 1) / tf) * idf, averaged over the sentence.
    double sctfidf = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (tf != 0)
            sctfidf += (((tf - 1) / tf) * idf.get(aWord));
    }
    // BUG FIX: guard the division — an empty word list previously produced
    // 0.0 / 0 = NaN, which poisoned the sentence weight and every later
    // comparison against it.
    if (sent.words.Count > 0)
        sctfidf = sctfidf / sent.words.Count;
    Trace.write("SCTFIDF : " + sctfidf);

    weight = sclead + sctitle + sccue + sctfidf;
    sent.weight = weight;
    Trace.write("Weight : " + weight);

    return (weight);
}
public override string generateSummary(Document newDoc, double compressionRatio)
{
    // Weight every sentence, then delegate sentence selection to the
    // summary utility using the requested compression ratio.
    foreach (Sentence sentence in newDoc.sentences)
    {
        calcSentenceWeight(IDF.getInstance(), newDoc, sentence);
    }

    Sentence[] ranked = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));
    string genSummary = SummaryUtil.SummarizeByCompressionRatio(ranked, compressionRatio);

    return (genSummary);
}
public virtual Document process(string docText)
{
    // Base document processor: normalizes Arabic text, splits it into
    // sentences, removes stop words, lemmatizes, filters non-Arabic tokens
    // and builds per-sentence word counts. The first non-empty split
    // becomes the title; the rest become ordered sentences.
    Document doc = new Document();
    doc.originalText = docText;

    // Begin : Preprocessing
    // Normalize Arabic character variants (hamza forms, alef maqsura).
    docText = Regex.Replace(docText, "أ|إ", "ا");
    docText = Regex.Replace(docText, "ى", "ي");
    doc.fullText = docText;
    // End : Preprocessing

    // Split into sentences on full stops, "<br>" markers and line breaks.
    string[] splits = Regex.Split(docText, @"\.<br>|\.|\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // PERF FIX: hoisted out of the loop — these patterns are loop-invariant,
    // and constructing a RegexOptions.Compiled regex per iteration is costly.
    Regex validWordRegex = new Regex(@"[\u0600-\u06FF\u0750-\u076D]", RegexOptions.Compiled);
    Regex toRemove = new Regex(@"[0-9\u066B\u066C\u060C]", RegexOptions.Compiled);

    ArrayList sentences = new ArrayList();
    // BUG FIX: the running sentence order must live OUTSIDE the loop.
    // It was previously redeclared (reset to 0) inside each iteration,
    // so every sentence ended up with order == 1.
    int sentOrder = 0;
    for (int i = 0; i < splits.Length; i++)
    {
        string text = splits[i];
        if (text == null)
            continue;
        if (text.Trim().Equals(""))
            continue;

        Sentence sent = new Sentence();
        sent.fullText = text;

        text = Regex.Replace(text, @"^\s+", "");
        text = Regex.Replace(text, @"\s+$", "");

        // Remove Stop Words
        text = StopWordsHandler.getInstance(Conf.STOP_WORDS_PATH).remove(text);

        // Lemmatizer
        text = Lemmatizer.getInstance(Conf.LEMMATIZATION_WORDS_PATH).replace(text);

        string[] wordSplits = Regex.Split(text, @"\s+", RegexOptions.IgnorePatternWhitespace);

        ArrayList words = new ArrayList();
        Hashtable wordsCount = new Hashtable();
        foreach (string word in wordSplits)
        {
            // Keep only tokens containing at least one Arabic letter.
            if (!validWordRegex.IsMatch(word))
                continue;
            // Strip digits and Arabic decimal/thousands separators and comma.
            string afterRemoval = toRemove.Replace(word, "");
            // Drop tokens shorter than two characters after stripping.
            if (afterRemoval.Length < 2)
                continue;
            words.Add(afterRemoval);
            if (wordsCount[afterRemoval] == null)
                wordsCount[afterRemoval] = 1;
            else
                wordsCount[afterRemoval] = (int)wordsCount[afterRemoval] + 1;
        }
        sent.words = words;
        sent.wordsCount = wordsCount;

        // Title detection by position, not value — equivalent to the old
        // reference comparison against splits[0], but cannot be fooled by
        // interned duplicate strings.
        if (i == 0)
            doc.title = sent;
        else
        {
            sent.order = ++sentOrder;
            sentences.Add(sent);
        }
    }
    doc.sentences = sentences;

    return doc;
}