public PosSentence(List taggedSentence)
        {
            // Convert a java.util.List of Stanford TaggedWord entries into
            // our own PosTaggedWord list (word text + POS tag per token).
            Words = new List <PosTaggedWord>();

            for (var it = taggedSentence.iterator(); it.hasNext();)
            {
                var tagged = (TaggedWord)it.next();
                Words.Add(new PosTaggedWord(tagged.word(), tagged.tag()));
            }
        }
// Beispiel #2 (Example #2)
        /// <summary>
        /// BackgroundWorker entry point: loads the selected MaxentTagger model,
        /// enumerates the *.txt files in the chosen folder, tags each file's
        /// sentences, and writes one CSV row per segment (tag counts, optionally
        /// normalized to percentages, plus optional tagged-text columns).
        /// Runs off the UI thread; all control access goes through Invoke.
        /// </summary>
        /// <param name="sender">The BackgroundWorker raising the event (unused).</param>
        /// <param name="e">Carries a BackgroundWorkerData instance in e.Argument.</param>
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            BackgroundWorkerData BGData = (BackgroundWorkerData)e.Argument;


            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker) delegate
            {
                FilenameLabel.Text = "Loading model...";
            });


            //selects the text encoding based on user selection
            //(read from the dropdown on the UI thread)
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });



            //get the list of files, recursing into subfolders only if requested
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(BGData.TextFileFolder, "*.txt", SearchDepth);



            try {
                var tagger = new MaxentTagger(modelsDirectory + @"/" + BGData.SelectedModel);

                int NumberOfTagsInModel = tagger.numTags();

                //two parallel tag lists: one quoted for the CSV header, one raw
                //for keying the per-segment count dictionary. Both are sorted so
                //header order matches column order.
                List <string> tags_list_header = new List <string>();
                List <string> tags_list        = new List <string>();


                for (int i = 0; i < NumberOfTagsInModel; i++)
                {
                    tags_list_header.Add("\"" + tagger.getTag(i) + "\"");
                    tags_list.Add(tagger.getTag(i));
                }

                tags_list_header.Sort();
                tags_list.Sort();

                string[] tags_array = tags_list.ToArray();



                //open up the output file
                using (StreamWriter outputFile = new StreamWriter(new FileStream(BGData.OutputFileLocation, FileMode.Create), SelectedEncoding))
                {
                    //write the header row to the output file
                    StringBuilder HeaderString = new StringBuilder();
                    HeaderString.Append("\"Filename\",\"Segment\",\"TokenCount\",\"SentenceCount\"," + string.Join(",", tags_list_header.ToArray()));

                    if (BGData.OutputTaggedText)
                    {
                        HeaderString.Append(",\"TaggedText\"");
                    }
                    if (BGData.OrderedPOSTagText)
                    {
                        HeaderString.Append(",\"OrderedPOSTags\"");
                    }

                    outputFile.WriteLine(HeaderString.ToString());


                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);


                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });



                        //read in the text file using the user-selected encoding
                        var InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();

                        //let the tagger's own tokenizer split the text into sentences
                        var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(InputText)).toArray();



                        //now that we know how many sentences we have, we can figure out the segmentation.
                        //NOTE: an empty file leaves NumberOfSegments at 0, so no rows are
                        //emitted for it — that is deliberate.
                        double SentencesPerSegment = 1.0;
                        int    NumberOfSegments    = BGData.NumSegments;
                        if (NumberOfSegments > sentences.Length)
                        {
                            NumberOfSegments = sentences.Length;
                        }

                        if (sentences.Length > 0)
                        {
                            SentencesPerSegment = sentences.Length / (double)NumberOfSegments;
                        }


                        //bucket the sentences into NumberOfSegments roughly equal segments
                        List <List <ArrayList> > Sentences_Segmented = new List <List <ArrayList> >();

                        int SegmentCounter = 1;
                        for (int i = 0; i < sentences.Length; i++)
                        {
                            if (Sentences_Segmented.Count < SegmentCounter)
                            {
                                Sentences_Segmented.Add(new List <ArrayList>());
                            }

                            Sentences_Segmented[SegmentCounter - 1].Add((ArrayList)sentences[i]);

                            //advance to the next segment once this one has its share
                            if (i + 1 >= SegmentCounter * SentencesPerSegment)
                            {
                                SegmentCounter++;
                            }
                        }


                        //release the flat sentence array; only the segmented copy is needed now
                        sentences = null;



                        //     _                _                 _____         _
                        //    / \   _ __   __ _| |_   _ _______  |_   _|____  _| |_
                        //   / _ \ | '_ \ / _` | | | | |_  / _ \   | |/ _ \ \/ / __|
                        //  / ___ \| | | | (_| | | |_| |/ /  __/   | |  __/>  <| |_
                        // /_/   \_\_| |_|\__,_|_|\__, /___\___|   |_|\___/_/\_\\__|
                        //                        |___/



                        for (int i = 0; i < NumberOfSegments; i++)
                        {
                            //per-segment tag counts, keyed by raw tag name
                            Dictionary <string, int> POSSums = new Dictionary <string, int>();
                            for (int j = 0; j < NumberOfTagsInModel; j++)
                            {
                                POSSums.Add(tags_array[j], 0);
                            }


                            StringBuilder TaggedText     = new StringBuilder();
                            StringBuilder OrderedPOSTags = new StringBuilder();

                            int TotalSentences = Sentences_Segmented[i].Count;
                            int TotalWC        = 0;


                            foreach (ArrayList sentence in Sentences_Segmented[i])
                            {
                                var taggedSentence = tagger.tagSentence(sentence);


                                Iterator it = taggedSentence.iterator();



                                while (it.hasNext())
                                {
                                    TaggedWord token = (TaggedWord)it.next();

                                    if (BGData.OutputTaggedText)
                                    {
                                        TaggedText.Append(token.toString() + " ");
                                    }
                                    if (BGData.OrderedPOSTagText)
                                    {
                                        OrderedPOSTags.Append(token.tag() + " ");
                                    }


                                    POSSums[token.tag()] += 1;
                                    TotalWC += 1;
                                }

                                TaggedText.Append(Environment.NewLine);
                                OrderedPOSTags.Append(Environment.NewLine);
                            }



                            // __        __    _ _          ___        _               _
                            // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                            //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                            //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                            //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                            //                                            |_|



                            //fixed leading columns: filename, 1-based segment, token count, sentence count
                            string[] OutputString = new string[4];
                            OutputString[0] = "\"" + Filename_Clean + "\"";
                            OutputString[1] = (i + 1).ToString();
                            OutputString[2] = TotalWC.ToString();
                            OutputString[3] = TotalSentences.ToString();

                            //0/1 flags double as column-offset math below
                            int include_tagged_text = 0;
                            int include_ordered_pos = 0;
                            if (BGData.OutputTaggedText)
                            {
                                include_tagged_text = 1;
                            }
                            if (BGData.OrderedPOSTagText)
                            {
                                include_ordered_pos = 1;
                            }

                            string[] TagOutputString = new string[NumberOfTagsInModel + include_tagged_text + include_ordered_pos];

                            for (int j = 0; j < NumberOfTagsInModel; j++)
                            {
                                if (BGData.NormalizeOutput && TotalWC > 0)
                                {
                                    //percentage of tokens carrying this tag, rounded to 5 places
                                    TagOutputString[j] = RoundUp(POSSums[tags_array[j]] * 100 / (double)TotalWC, 5).ToString();
                                }
                                else
                                {
                                    TagOutputString[j] = POSSums[tags_array[j]].ToString();
                                }
                            }

                            //optional trailing columns; embedded quotes are CSV-escaped by doubling
                            if (BGData.OutputTaggedText)
                            {
                                TagOutputString[TagOutputString.Length - include_tagged_text - include_ordered_pos] = "\"" + TaggedText.ToString().Replace("\"", "\"\"") + "\"";
                            }
                            if (BGData.OrderedPOSTagText)
                            {
                                TagOutputString[TagOutputString.Length - include_ordered_pos] = "\"" + OrderedPOSTags.ToString().Replace("\"", "\"\"") + "\"";
                            }

                            outputFile.WriteLine(string.Join(",", MergeOutputArrays(OutputString, TagOutputString)));
                        }



                        //end of the "for each file" loop
                    }
                }
            }
            catch (OutOfMemoryException)
            {
                MessageBox.Show("One or more of your files caused an Out of Memory error. This means that you do not have enough RAM to process the current file. This is often caused by extremely complex / messy language samples with run-on sentences or other peculiar constructions, paired with a computer that does not have enough RAM to handle such processing.", "Out of Memory", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
            catch
            {
                //deliberate catch-all: most failures here are a locked/open output
                //file or a vanished input file; report and let the worker finish
                MessageBox.Show("POSTModern encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while POSTModern is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
        /// <summary>
        /// Plugin entry point: POS-tags every incoming text in Input.StringList
        /// and returns a Payload whose ObjectList holds one TaggerOutputObject
        /// per text (tag counts, tagged tokens, tags-only tokens, totals).
        /// </summary>
        /// <param name="Input">Incoming payload; StringList and SegmentNumber are read per text.</param>
        /// <returns>A new Payload carrying the per-text tagger results.</returns>
        public Payload RunPlugin(Payload Input)
        {
            //custom class to hold all of the relevant output

            Payload pData = new Payload();

            pData.FileID     = Input.FileID;
            pData.SegmentID  = Input.SegmentID;
            pData.ObjectList = new List <object>();
            //NOTE(review): pData.SegmentNumber is appended to below but not
            //initialized here — presumably the Payload constructor creates it; verify.



            #region For Each Incoming Text
            for (int counter = 0; counter < Input.StringList.Count; counter++)
            {
                #region Setting Up the Output Object with basic data that we'll work with later
                TaggerOutputObject TaggerOutput = new TaggerOutputObject();
                TaggerOutput.ModelTags = new Dictionary <int, string>();


                //seed the tag index map and zeroed count dictionary from the
                //class-level tag list (POSSums is assumed to be constructed by
                //TaggerOutputObject itself)
                for (int i = 0; i < NumberOfTagsInModel; i++)
                {
                    TaggerOutput.ModelTags.Add(i, posTags[i]);
                    TaggerOutput.POSSums.Add(posTags[i], 0);
                }
                #endregion


                #region Segment Sentences, Prepare to Tag
                var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(Input.StringList[counter])).toArray();
                TaggerOutput.TotalSentences = sentences.Length;
                TaggerOutput.TotalWC        = 0;
                #endregion


                #region Do the Actual Tagging

                List <string> TaggedText          = new List <string>();
                List <string> TaggedText_TagsOnly = new List <string>();

                foreach (ArrayList sentence in sentences)
                {
                    try {
                        var taggedSentence = tagger.tagSentence(sentence);

                        Iterator it = taggedSentence.iterator();

                        while (it.hasNext())
                        {
                            TaggedWord token = (TaggedWord)it.next();
                            TaggedText.Add(token.toString());
                            TaggedText_TagsOnly.Add(token.tag());
                            TaggerOutput.POSSums[token.tag()] += 1;
                            TaggerOutput.TotalWC += 1;
                        }
                    }
                    catch (OutOfMemoryException)
                    {
                        //best-effort: report the failed sentence and keep tagging the rest
                        MessageBox.Show("Plugin Error: Core NLP POS Tagger. One or more of your files caused an Out of Memory error. This means that you do not have enough RAM to process the current text. This is often caused by extremely complex / messy language samples with run-on sentences or other peculiar constructions, paired with a computer that does not have enough RAM to handle such processing.", "Plugin Error (Out of Memory): Core NLP POS Tagger", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    }
                }
                TaggerOutput.TaggedText          = TaggedText.ToArray();
                TaggerOutput.TaggedText_TagsOnly = TaggedText_TagsOnly.ToArray();
                #endregion

                pData.ObjectList.Add(TaggerOutput);
                pData.SegmentNumber.Add(Input.SegmentNumber[counter]);
            }
            #endregion

            return pData;
        }