private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
{
    BackgroundWorkerData BGData = (BackgroundWorkerData)e.Argument;

    //report what we're working on
    FilenameLabel.Invoke((MethodInvoker)delegate
    {
        FilenameLabel.Text = "Loading model...";
    });

    //set up our sentence boundary detection
    //(currently unused in this method; MaxentTagger.tokenizeText handles sentence splitting below)
    Regex SentenceSplitter = new Regex(@"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", RegexOptions.Compiled);

    //select the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(BGData.TextFileFolder, "*.txt", SearchDepth);

    try
    {
        var tagger = new MaxentTagger(modelsDirectory + @"/" + BGData.SelectedModel);

        //collect the model's tag inventory; the quoted copy becomes the CSV header row
        int NumberOfTagsInModel = tagger.numTags();
        List<string> tags_list_header = new List<string>();
        List<string> tags_list = new List<string>();
        for (int i = 0; i < NumberOfTagsInModel; i++)
        {
            tags_list_header.Add("\"" + tagger.getTag(i) + "\"");
            tags_list.Add(tagger.getTag(i));
        }
        tags_list_header.Sort();
        tags_list.Sort();
        string[] tags_array = tags_list.ToArray();

        //open up the output file
        using (StreamWriter outputFile = new StreamWriter(new FileStream(BGData.OutputFileLocation, FileMode.Create), SelectedEncoding))
        {
            //write the header row to the output file
            StringBuilder HeaderString = new StringBuilder();
            HeaderString.Append("\"Filename\",\"Segment\",\"TokenCount\",\"SentenceCount\"," + string.Join(",", tags_list_header.ToArray()));
            if (BGData.OutputTaggedText)
            {
                HeaderString.Append(",\"TaggedText\"");
            }
            if (BGData.OrderedPOSTagText)
            {
                HeaderString.Append(",\"OrderedPOSTags\"");
            }
            outputFile.WriteLine(HeaderString.ToString());

            foreach (string fileName in files)
            {
                //set up our variables to report
                string Filename_Clean = Path.GetFileName(fileName);

                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate
                {
                    FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                });

                //read in the text file and split it into sentences
                var InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();
                var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(InputText)).toArray();

                //now that we know how many sentences we have, we can figure out the segmentation
                double SentencesPerSegment = 1.0;
                int NumberOfSegments = BGData.NumSegments;
                if (NumberOfSegments > sentences.Length)
                {
                    NumberOfSegments = sentences.Length;
                }
                if (sentences.Length > 0)
                {
                    SentencesPerSegment = sentences.Length / (double)NumberOfSegments;
                }

                //distribute the sentences as evenly as possible across the segments
                List<List<ArrayList>> Sentences_Segmented = new List<List<ArrayList>>();
                int SegmentCounter = 1;
                for (int i = 0; i < sentences.Length; i++)
                {
                    if (Sentences_Segmented.Count < SegmentCounter)
                    {
                        Sentences_Segmented.Add(new List<ArrayList>());
                    }
                    Sentences_Segmented[SegmentCounter - 1].Add((ArrayList)sentences[i]);
                    if (i + 1 >= SegmentCounter * SentencesPerSegment)
                    {
                        SegmentCounter++;
                    }
                }
                sentences = null;

                // ----- Analyze Text -----
                for (int i = 0; i < NumberOfSegments; i++)
                {
                    Dictionary<string, int> POSSums = new Dictionary<string, int>();
                    for (int j = 0; j < NumberOfTagsInModel; j++)
                    {
                        POSSums.Add(tags_array[j], 0);
                    }

                    StringBuilder TaggedText = new StringBuilder();
                    StringBuilder OrderedPOSTags = new StringBuilder();
                    int TotalSentences = Sentences_Segmented[i].Count;
                    int TotalWC = 0;

                    foreach (ArrayList sentence in Sentences_Segmented[i])
                    {
                        var taggedSentence = tagger.tagSentence(sentence);
                        Iterator it = taggedSentence.iterator();
                        while (it.hasNext())
                        {
                            TaggedWord token = (TaggedWord)it.next();
                            if (BGData.OutputTaggedText)
                            {
                                TaggedText.Append(token.toString() + " ");
                            }
                            if (BGData.OrderedPOSTagText)
                            {
                                OrderedPOSTags.Append(token.tag() + " ");
                            }
                            POSSums[token.tag()] += 1;
                            TotalWC += 1;
                        }
                        TaggedText.Append(Environment.NewLine);
                        OrderedPOSTags.Append(Environment.NewLine);
                    }

                    // ----- Write Output -----
                    string[] OutputString = new string[4];
                    OutputString[0] = "\"" + Filename_Clean + "\"";
                    OutputString[1] = (i + 1).ToString();
                    OutputString[2] = TotalWC.ToString();
                    OutputString[3] = TotalSentences.ToString();

                    int include_tagged_text = 0;
                    int include_ordered_pos = 0;
                    if (BGData.OutputTaggedText)
                    {
                        include_tagged_text = 1;
                    }
                    if (BGData.OrderedPOSTagText)
                    {
                        include_ordered_pos = 1;
                    }

                    string[] TagOutputString = new string[NumberOfTagsInModel + include_tagged_text + include_ordered_pos];
                    for (int j = 0; j < NumberOfTagsInModel; j++)
                    {
                        if (BGData.NormalizeOutput && TotalWC > 0)
                        {
                            //report each tag as a (rounded) percentage of the segment's token count
                            TagOutputString[j] = RoundUp(POSSums[tags_array[j]] * 100 / (double)TotalWC, 5).ToString();
                        }
                        else
                        {
                            TagOutputString[j] = POSSums[tags_array[j]].ToString();
                        }
                    }

                    //double up embedded quotes so the tagged text survives as a CSV field
                    if (BGData.OutputTaggedText)
                    {
                        TagOutputString[TagOutputString.Length - include_tagged_text - include_ordered_pos] = "\"" + TaggedText.ToString().Replace("\"", "\"\"") + "\"";
                    }
                    if (BGData.OrderedPOSTagText)
                    {
                        TagOutputString[TagOutputString.Length - include_ordered_pos] = "\"" + OrderedPOSTags.ToString().Replace("\"", "\"\"") + "\"";
                    }

                    outputFile.WriteLine(String.Join(",", MergeOutputArrays(OutputString, TagOutputString)));
                } //end of the "for each segment" loop
            } //end of the "for each file" loop
        }
    }
    catch (OutOfMemoryException)
    {
        MessageBox.Show("One or more of your files caused an Out of Memory error. This means that you do not have enough RAM to process the current file. This is often caused by extremely complex / messy language samples with run-on sentences or other peculiar constructions, paired with a computer that does not have enough RAM to handle such processing.", "Out of Memory", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
    catch
    {
        MessageBox.Show("POSTModern encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while POSTModern is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
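// Usage sketch (not from the original source): one way the handler above might be
// wired up. BackgroundWorker comes from System.ComponentModel, which WinForms code
// like this already references. The property values below are hypothetical, only the
// members actually read inside BgWorkerClean_DoWork are shown, and object-initializer
// support on BackgroundWorkerData is an assumption.
private void StartCleanAnalysis()
{
    var worker = new BackgroundWorker();
    worker.DoWork += BgWorkerClean_DoWork;

    //the argument object carries all user settings into the worker thread
    worker.RunWorkerAsync(new BackgroundWorkerData
    {
        TextFileFolder = @"C:\MyCorpus",                      //folder scanned for *.txt files
        SelectedModel = "english-left3words-distsim.tagger",  //hypothetical model filename under modelsDirectory
        OutputFileLocation = @"C:\MyCorpus\pos_output.csv",
        NumSegments = 1,                                      //1 = treat each file as a single segment
        OutputTaggedText = false,
        OrderedPOSTagText = false,
        NormalizeOutput = true                                //percentages rather than raw tag counts
    });
}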
public Payload RunPlugin(Payload Input)
{
    //custom class to hold all of the relevant output
    Payload pData = new Payload();
    pData.FileID = Input.FileID;
    pData.SegmentID = Input.SegmentID;
    pData.ObjectList = new List<object>();

    #region For Each Incoming Text
    for (int counter = 0; counter < Input.StringList.Count; counter++)
    {
        #region Setting Up the Output Object with basic data that we'll work with later
        TaggerOutputObject TaggerOutput = new TaggerOutputObject();
        TaggerOutput.ModelTags = new Dictionary<int, string>();
        for (int i = 0; i < NumberOfTagsInModel; i++)
        {
            TaggerOutput.ModelTags.Add(i, posTags[i]);
            TaggerOutput.POSSums.Add(posTags[i], 0);
        }
        #endregion

        #region Segment Sentences, Prepare to Tag
        var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(Input.StringList[counter])).toArray();
        TaggerOutput.TotalSentences = sentences.Length;
        TaggerOutput.TotalWC = 0;
        #endregion

        #region Do the Actual Tagging
        List<string> TaggedText = new List<string>();
        List<string> TaggedText_TagsOnly = new List<string>();
        foreach (ArrayList sentence in sentences)
        {
            try
            {
                var taggedSentence = tagger.tagSentence(sentence);
                Iterator it = taggedSentence.iterator();
                while (it.hasNext())
                {
                    TaggedWord token = (TaggedWord)it.next();
                    TaggedText.Add(token.toString());
                    TaggedText_TagsOnly.Add(token.tag().ToString());
                    TaggerOutput.POSSums[token.tag()] += 1;
                    TaggerOutput.TotalWC += 1;
                }
            }
            catch (OutOfMemoryException)
            {
                MessageBox.Show("Plugin Error: Core NLP POS Tagger. One or more of your files caused an Out of Memory error. This means that you do not have enough RAM to process the current text. This is often caused by extremely complex / messy language samples with run-on sentences or other peculiar constructions, paired with a computer that does not have enough RAM to handle such processing.", "Plugin Error (Out of Memory): Core NLP POS Tagger", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
        TaggerOutput.TaggedText = TaggedText.ToArray();
        TaggerOutput.TaggedText_TagsOnly = TaggedText_TagsOnly.ToArray();
        #endregion

        pData.ObjectList.Add(TaggerOutput);
        pData.SegmentNumber.Add(Input.SegmentNumber[counter]);
    }
    #endregion

    return pData;
}
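// Usage sketch (not from the original source): how a host application might call
// RunPlugin on a single text. The Payload member types are inferred from their use
// above; treating FileID as a string and SegmentNumber as a list of ints are
// assumptions, and CoreNLPPOSTaggerPlugin is a hypothetical name for the class
// that defines RunPlugin.
public static void TagOneText(CoreNLPPOSTaggerPlugin taggerPlugin)
{
    Payload input = new Payload();
    input.FileID = "Example.txt";                             //assumed string-typed ID
    input.StringList = new List<string>
    {
        "The quick brown fox jumps over the lazy dog."
    };
    input.SegmentNumber = new List<int> { 1 };                //element type is an assumption

    Payload output = taggerPlugin.RunPlugin(input);

    //each entry in ObjectList corresponds to one input string
    TaggerOutputObject result = (TaggerOutputObject)output.ObjectList[0];
    Console.WriteLine("Sentences: " + result.TotalSentences + ", Tokens: " + result.TotalWC);
}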