Beispiel #1
0
        //  _____      _                 _   _____        _          ______ _ _
        // |  __ \    | |               | | |  __ \      | |        |  ____(_) |
        // | |__) |___| | ___   __ _  __| | | |  | | __ _| |_ __ _  | |__   _| | ___
        // |  _  // _ \ |/ _ \ / _` |/ _` | | |  | |/ _` | __/ _` | |  __| | | |/ _ \
        // | | \ \  __/ | (_) | (_| | (_| | | |__| | (_| | || (_| | | |    | | |  __/
        // |_|  \_\___|_|\___/ \__,_|\__,_| |_____/ \__,_|\__\__,_| |_|    |_|_|\___|



        /// <summary>
        /// Handles the reload button: clears the current preview and, when a file has been selected,
        /// starts the preview background worker again with the reader options currently on the form.
        /// </summary>
        /// <param name="sender">The control that raised the click.</param>
        /// <param name="e">Unused event data.</param>
        private void ReloadCSVButton_Click(object sender, EventArgs e)
        {
            ColumnNameCheckedListbox.Items.Clear();

            // Put up the status text and force it to paint before the grid is unbound below.
            FilenameLabel.Text = "Clearing old preview... (This might take a while for previews with a large number of columns.)";
            FilenameLabel.Invalidate();
            FilenameLabel.Update();
            FilenameLabel.Refresh();

            dataGridView1.DataSource = null;
            FilenameLabel.Text       = "Ready to load a data file preview.";

            // The display box holds this placeholder until a file has been chosen; nothing to reload then.
            if (FilenameDisplayBox.Text == "No file selected...")
            {
                return;
            }

            DisableButtons();

            // Snapshot the reader options so the worker does not have to touch the controls.
            BgWorkerInformation workerArguments = new BgWorkerInformation
            {
                InputFile   = FilenameDisplayBox.Text,
                HasHeaders  = HeaderRowDropdown.SelectedItem.ToString(),
                Delimiters  = DelimiterTextBox.Text.ToString(),
                UsingQuotes = EnclosedInQuotesDropdown.SelectedItem.ToString()
            };

            LoadCSVPreview_BGWorker.RunWorkerAsync(workerArguments);
        }
Beispiel #2
0
        //  _                     _   _____        _          ____        _   _
        // | |                   | | |  __ \      | |        |  _ \      | | | |
        // | |     ___   __ _  __| | | |  | | __ _| |_ __ _  | |_) |_   _| |_| |_ ___  _ __
        // | |    / _ \ / _` |/ _` | | |  | |/ _` | __/ _` | |  _ <| | | | __| __/ _ \| '_ \ 
        // | |___| (_) | (_| | (_| | | |__| | (_| | || (_| | | |_) | |_| | |_| || (_) | | | |
        // |______\___/ \__,_|\__,_| |_____/ \__,_|\__\__,_| |____/ \__,_|\__|\__\___/|_| |_|
        //


        /// <summary>
        /// Handles the load-data button: resets the current preview, asks the user to pick a data file
        /// and, if one is chosen, starts the background worker that builds the preview.
        /// </summary>
        /// <param name="sender">The control that raised the click.</param>
        /// <param name="e">Unused event data.</param>
        private void GeneratePreviewButton_Click(object sender, EventArgs e)
        {
            // Reset everything that described the previously loaded file.
            FirstColumnComboBox.Items.Clear();
            LastColumnComboBox.Items.Clear();
            TokenColumnComboBox.Items.Clear();

            FilenameDisplayBox.Text = "No file selected...";

            // Put up the status text and force it to paint before the grid is unbound below.
            FilenameLabel.Text = "Clearing old preview... (This might take a while for previews with a large number of columns.)";
            FilenameLabel.Invalidate();
            FilenameLabel.Update();
            FilenameLabel.Refresh();

            dataGridView1.DataSource = null;
            FilenameLabel.Text       = "Ready to load a data file preview.";

            openFileDialog.Title = "Please select your data file...";

            if (openFileDialog.ShowDialog() == DialogResult.Cancel)
            {
                // The selectors and the filename box were already reset above; with no file
                // selected there is nothing to reload or process.
                StartButton.Enabled     = false;
                ReloadCSVButton.Enabled = false;
                return;
            }

            DisableButtons();

            FilenameDisplayBox.Text = openFileDialog.FileName;

            FilenameDisplayBox.Focus();
            // Move the caret to the end of the text box
            FilenameDisplayBox.Select(FilenameDisplayBox.Text.Length, 0);

            // Snapshot the reader options so the worker does not have to touch the controls.
            BgWorkerInformation BgData = new BgWorkerInformation();

            BgData.InputFile   = FilenameDisplayBox.Text;
            BgData.HasHeaders  = HeaderRowDropdown.SelectedItem.ToString();
            BgData.Delimiters  = DelimiterTextBox.Text.ToString();
            BgData.UsingQuotes = EnclosedInQuotesDropdown.SelectedItem.ToString();

            LoadCSVPreview_BGWorker.RunWorkerAsync(BgData);
        }
        // Disposes each outlet that is currently set and clears its reference.
        void ReleaseDesignerOutlets()
        {
            FilenameLabel?.Dispose();
            FilenameLabel = null;

            ImageView?.Dispose();
            ImageView = null;
        }
Beispiel #4
0
        /// <summary>
        /// Runs when the preview worker finishes: binds the worker's result to the grid and fills the
        /// column selectors, or reports a parse error when the result cannot be shown.
        /// </summary>
        /// <param name="sender">The background worker that completed.</param>
        /// <param name="e">Completion data; <c>e.Result</c> is used as the grid's data source.</param>
        private void LoadCSVPreview_BGWorker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
        {
            FilenameLabel.Text = "Please wait while preview is being generated... (This might take a while for files with a large number of columns.)";
            FilenameLabel.Invalidate();
            FilenameLabel.Update();
            FilenameLabel.Refresh();
            Application.DoEvents();

            bool previewLoaded = false;

            try
            {
                // Reading e.Result throws if the worker faulted or was cancelled, and the grid rejects
                // a result it cannot bind to; either way that is treated as a parse failure below.
                dataGridView1.DataSource = e.Result;

                // A preview without columns gives the selectors nothing to choose from, so it is
                // reported as a failure rather than as a loaded preview.
                if (dataGridView1.Columns.Count > 0)
                {
                    foreach (DataGridViewColumn column in dataGridView1.Columns)
                    {
                        TokenColumnComboBox.Items.Add(column.HeaderText);
                        FirstColumnComboBox.Items.Add(column.HeaderText);
                        LastColumnComboBox.Items.Add(column.HeaderText);
                    }

                    TokenColumnComboBox.SelectedIndex = 0;
                    FirstColumnComboBox.SelectedIndex = 0;
                    LastColumnComboBox.SelectedIndex  = LastColumnComboBox.Items.Count - 1;

                    previewLoaded = true;
                }
            }
            catch (Exception)
            {
                previewLoaded = false;
            }

            // NOTE(review): the click handlers call DisableButtons() before starting the worker, so
            // the controls are re-enabled on both outcomes here; confirm EnableButtons() is its inverse.
            EnableButtons();
            ReloadCSVButton.Enabled = previewLoaded;
            StartButton.Enabled     = previewLoaded;

            // The dialog is shown only once the outcome is known, so a failed load never
            // announces success first.
            if (previewLoaded)
            {
                MessageBox.Show("Your data file preview has been loaded." + "\r\n\r\n" +
                                "If your preview window appears to be empty, you most likely need to edit your settings under the \"Options for Reading Data File\" section.", "Preview Loaded", MessageBoxButtons.OK, MessageBoxIcon.Information);
            }
            else
            {
                MessageBox.Show("Your spreadsheet file could not be properly parsed" + "\r\n" +
                                "with the current settings. Please make sure that the" + "\r\n" +
                                "file is not open elsewhere, check your settings, and" + "\r\n" +
                                "try again.", "Data Parse Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }

            FilenameLabel.Text = "Finished creating dataset preview.";
        }
Beispiel #5
0
        //   _____                           _         _____                _
        //  / ____|                         | |       |  __ \              (_)
        // | |  __  ___ _ __   ___ _ __ __ _| |_ ___  | |__) | __ _____   ___  _____      __
        // | | |_ |/ _ \ '_ \ / _ \ '__/ _` | __/ _ \ |  ___/ '__/ _ \ \ / / |/ _ \ \ /\ / /
        // | |__| |  __/ | | |  __/ | | (_| | ||  __/ | |   | | |  __/\ V /| |  __/\ V  V /
        //  \_____|\___|_| |_|\___|_|  \__,_|\__\___| |_|   |_|  \___| \_/ |_|\___| \_/\_/
        //



        /// <summary>
        /// Background worker entry point that parses the start of a delimited text file into a
        /// <see cref="DataTable"/> for the preview grid.
        /// </summary>
        /// <param name="sender">The background worker raising the event.</param>
        /// <param name="e">
        /// <c>e.Argument</c> must be a <c>BgWorkerInformation</c> carrying the input file and reader
        /// options. On return <c>e.Result</c> is the parsed table, or <c>false</c> when parsing failed.
        /// </param>
        private void LoadCSVPreview_BGWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //unpack the settings that were handed to the background worker
            BgWorkerInformation BgData = (BgWorkerInformation)e.Argument;

            string   InputFile   = BgData.InputFile;
            bool     HasHeaders  = Convert.ToBoolean(BgData.HasHeaders);
            string[] Delimiters  = BgData.Delimiters.ToCharArray().Select(c => c.ToString()).ToArray();
            bool     UsingQuotes = Convert.ToBoolean(BgData.UsingQuotes);

            //the encoding dropdown is a control, so it is read through Invoke
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });

            // a data table we'll use to hold the parsed data
            DataTable dt = new DataTable();

            try
            {
                // create the parser
                using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding))
                {
                    // set the parser variables
                    parser.TrimWhiteSpace = true;
                    parser.TextFieldType  = FieldType.Delimited;
                    parser.SetDelimiters(Delimiters);
                    parser.HasFieldsEnclosedInQuotes = UsingQuotes;

                    //number of data rows added so far (a header row is not counted)
                    int  LineNumber = 0;
                    bool firstLine  = true;

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Preparing to read data file for preview...";
                    });

                    while (!parser.EndOfData)
                    {
                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Loading data file for preview... Data Row #" + LineNumber.ToString();
                        });

                        string[] fields = parser.ReadFields();

                        //ReadFields returns null once no data is left; guard in case it disagrees with EndOfData
                        if (fields == null)
                        {
                            break;
                        }

                        //the first row defines the columns
                        if (firstLine)
                        {
                            firstLine = false;

                            if (HasHeaders)
                            {
                                foreach (string header in fields)
                                {
                                    dt.Columns.Add(header);
                                }

                                //the header row only names the columns; it is not a data row
                                continue;
                            }

                            //no header row: name the columns v1, v2, ... and keep this row as data
                            for (int i = 1; i <= fields.Length; i++)
                            {
                                dt.Columns.Add("v" + i.ToString());
                            }
                        }

                        // get the row data
                        dt.Rows.Add(fields);
                        LineNumber++;

                        //the preview stops after 1000 data rows
                        if (LineNumber > 999)
                        {
                            break;
                        }
                    }
                }

                e.Result = dt;

                if (dt.Columns.Count < 1 || dt.Rows.Count < 1)
                {
                    //message boxes are UI too, so show this one through Invoke like the label updates above
                    this.Invoke((MethodInvoker) delegate()
                    {
                        MessageBox.Show("Your spreadsheet file could not be properly parsed" + "\r\n" +
                                        "with the current settings. WELP could not find any" + "\r\n" +
                                        "distinct columns and/or rows in your data file. This is" + "\r\n" +
                                        "most often caused by using the wrong delimiter(s).", "Data Parse Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    });
                }
            }
            catch (Exception)
            {
                //any failure is reported by handing back false instead of a table
                e.Result = false;
            }
        }
Beispiel #6
0
        // __          __   _ _          ____        _               _     ______ _ _
        // \ \        / /  (_) |        / __ \      | |             | |   |  ____(_) |
        //  \ \  /\  / / __ _| |_ ___  | |  | |_   _| |_ _ __  _   _| |_  | |__   _| | ___  ___
        //   \ \/  \/ / '__| | __/ _ \ | |  | | | | | __| '_ \| | | | __| |  __| | | |/ _ \/ __|
        //    \  /\  /| |  | | ||  __/ | |__| | |_| | |_| |_) | |_| | |_  | |    | | |  __/\__ \
        //     \/  \/ |_|  |_|\__\___|  \____/ \__,_|\__| .__/ \__,_|\__| |_|    |_|_|\___||___/
        //                                              | |
        //                                              |_|



        /// <summary>
        /// Background worker entry point that copies the selected columns of a delimited text file into
        /// a new "_SLIM_" file in the output location.
        /// </summary>
        /// <param name="sender">The background worker raising the event.</param>
        /// <param name="e">
        /// <c>e.Argument</c> must be a <c>BgWorkerInformation</c> carrying the input file, reader
        /// options, output location and the column indices to keep. On return <c>e.Result</c> is
        /// <c>null</c> when the loop finished and <c>"error"</c> when an exception occurred.
        /// </param>
        private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //unpack the settings that were handed to the background worker
            BgWorkerInformation BgData = (BgWorkerInformation)e.Argument;

            string   InputFile   = BgData.InputFile;
            string[] Delimiters  = BgData.Delimiters.ToCharArray().Select(c => c.ToString()).ToArray();
            bool     UsingQuotes = Convert.ToBoolean(BgData.UsingQuotes);

            //the remaining options live on controls, so they are read through Invoke
            Encoding SelectedEncoding = null;
            bool     DumpOutputAsTXT  = false;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
                DumpOutputAsTXT  = DumpAsTextCheckbox.Checked;
            });

            string OutputFile = Path.Combine(BgData.OutputLocation, "_SLIM_" + Path.GetFileName(InputFile));

            if (DumpOutputAsTXT)
            {
                OutputFile += ".txt";
            }

            //fields are only re-quoted when writing delimited output, never for the plain-text dump
            bool QuoteFields = UsingQuotes && !DumpOutputAsTXT;

            try
            {
                // create the parser
                using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding))
                {
                    // set the parser properties
                    parser.TrimWhiteSpace = true;     //trim the whitespace to make sure that files/folder names don't end with a space, which will break the program
                    parser.TextFieldType  = FieldType.Delimited;
                    parser.SetDelimiters(Delimiters);
                    parser.HasFieldsEnclosedInQuotes = UsingQuotes;

                    ulong LineNumber = 0;

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Preparing to write output files...";
                    });

                    using (FileStream fileStream = new FileStream(OutputFile, FileMode.Create, FileAccess.Write, FileShare.Read))
                    using (StreamWriter streamWriter = new StreamWriter(fileStream, SelectedEncoding))
                    {
                        //Loop through each row of the dataset until the data runs out or a cancel is requested.
                        //NOTE(review): a cancel request ends the loop without setting e.Cancel, so the run
                        //is reported as a normal completion; confirm that is intended.
                        while (!parser.EndOfData && !BgWorker.CancellationPending)
                        {
                            //parse out the row
                            string[] fields = parser.ReadFields();

                            LineNumber++;

                            //report what row we're working on (every 10th row, to limit cross-thread calls)
                            if (LineNumber % 10 == 0)
                            {
                                FilenameLabel.Invoke((MethodInvoker) delegate
                                {
                                    FilenameLabel.Text = "Currently writing row #" + LineNumber.ToString();
                                });
                            }

                            //prepare our output to write: only the columns listed in KeepCols, in that order
                            string[] output_array = new string[BgData.NumberOfColumns];

                            for (int i = 0; i < BgData.NumberOfColumns; i++)
                            {
                                string field = fields[BgData.KeepCols[i]];

                                //embedded quotes are doubled when the field is wrapped in quotes
                                output_array[i] = QuoteFields
                                                      ? '"' + field.Replace("\"", "\"\"") + '"'
                                                      : field;
                            }

                            //write our output: one field per line for the text dump, otherwise
                            //joined with the first delimiter
                            streamWriter.WriteLine(string.Join(DumpOutputAsTXT ? "\r\n" : Delimiters[0], output_array));
                        }
                    }
                }

                e.Result = null;
            }
            catch (Exception)
            {
                e.Result = "error";

                //message boxes are UI too, so show this one through Invoke like the label updates above
                this.Invoke((MethodInvoker) delegate()
                {
                    MessageBox.Show("SlimCSV has encountered an error while processing your file.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                });
            }
        }
Beispiel #7
0
        /// <summary>
        /// Background worker entry point that tokenizes every ".txt" file in the input folder and writes
        /// the space-separated tokens to a file of the same name in the output folder.
        /// </summary>
        /// <param name="sender">The background worker raising the event.</param>
        /// <param name="e">
        /// <c>e.Argument</c> must be a <c>string[]</c>: element 0 is the input folder, element 1 the
        /// output folder.
        /// </param>
        private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            string[] Arguments   = (string[])e.Argument;
            string   InputFolder = Arguments[0];

            //both options live on controls, so they are read through Invoke
            Encoding SelectedEncoding = null;
            bool     ScanSubfolders   = false;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
                ScanSubfolders   = ScanSubfolderCheckbox.Checked;
            });

            //get the list of files
            var SearchDepth = ScanSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var files       = Directory.EnumerateFiles(InputFolder, "*.txt", SearchDepth);

            try
            {
                string outputdir = Path.Combine(Arguments[1]);

                Directory.CreateDirectory(outputdir);

                //take a snapshot of the file list first, so that files written by this run are not
                //picked up if the output folder lies inside the folder being scanned
                foreach (string fileName in files.ToArray())
                {
                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    });

                    string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();

                    var TokenResults = TwitterKoreanProcessorCS.Tokenize(readText);

                    StringBuilder Builder = new StringBuilder();

                    //walk the tokens once, keeping everything except whitespace tokens
                    foreach (var token in TokenResults)
                    {
                        if (token.Pos != KoreanPos.Space)
                        {
                            Builder.Append(token.Text).Append(' ');
                        }
                    }

                    using (System.IO.StreamWriter fileout =
                               new StreamWriter(Path.Combine(outputdir, Filename_Clean), false, SelectedEncoding))
                    {
                        fileout.Write(Builder.ToString());
                    }
                }
            }
            catch (Exception)
            {
                //message boxes are UI too, so show this one through Invoke like the label updates above
                this.Invoke((MethodInvoker) delegate()
                {
                    MessageBox.Show("KoToken encountered a problem while trying to tokenize/write a file.");
                });
            }
        }
Beispiel #8
0
        /// <summary>
        /// Background worker entry point that splits every ".txt" transcript in the input folder into
        /// one output file per speaker, based on the speaker tags that start each line.
        /// </summary>
        /// <param name="sender">The background worker raising the event.</param>
        /// <param name="e">
        /// <c>e.Argument</c> must be a <c>string[]</c>: element 0 is the input folder, element 1 the
        /// folder in which the "ConverSplitter_Output" directory is created. <c>e.Cancel</c> is set
        /// when an output directory cannot be created.
        /// </param>
        private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            string[] Arguments   = (string[])e.Argument;
            string   InputFolder = Arguments[0];

            //used to break text into lines (runs of line breaks count as one separator)
            Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

            //everything that lives on a control is read through Invoke
            Encoding SelectedEncoding     = null;
            bool     SpeakerMultipleLines = false;
            bool     ScanSubfolders       = false;
            string   RegExString          = "";
            string   SpeakerListRawText   = "";

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding     = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
                SpeakerMultipleLines = SpeakersMultipleLinesCheckbox.Checked;
                RegExString          = RegexTextBox.Text;
                SpeakerListRawText   = SpeakerListTextBox.Text;
                ScanSubfolders       = ScanSubfolderCheckbox.Checked;
            });

            //the optional regex is only built (and applied) when the user supplied a pattern
            bool  UsingRegex    = !string.IsNullOrEmpty(RegExString);
            Regex CompiledRegex = UsingRegex ? new Regex(RegExString, RegexOptions.Compiled) : null;

            //the speaker list: one tag per line, blanks removed (matching is case-sensitive)
            string[] SpeakerList = NewlineClean.Split(SpeakerListRawText)
                                               .Where(x => !string.IsNullOrWhiteSpace(x))
                                               .ToArray();

            //get the list of files
            var SearchDepth = ScanSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var files       = Directory.EnumerateFiles(InputFolder, "*.txt", SearchDepth);

            string outputFolder = System.IO.Path.Combine(Arguments[1], "ConverSplitter_Output");

            try
            {
                System.IO.Directory.CreateDirectory(outputFolder);
            }
            catch (Exception)
            {
                this.Invoke((MethodInvoker) delegate()
                {
                    MessageBox.Show("ConverSplitterPlus could not create your output folder.\r\nIs your output directory write protected?");
                });
                e.Cancel = true;

                //without an output folder nothing can be written, so stop here
                return;
            }

            try
            {
                //take a snapshot of the file list first, so that files written by this run are not
                //picked up if the output folder lies inside the folder being scanned
                foreach (string fileName in files.ToArray())
                {
                    //by default everything is written straight into the output folder
                    string destinationFolder = outputFolder;

                    if (ScanSubfolders)
                    {
                        //mirror the file's location below the input folder inside the output folder
                        //NOTE(review): this assumes GetDirectoryName(fileName) contains InputFolder
                        //verbatim (same casing, no trailing separator) - confirm against the caller.
                        string subfolder = Path.GetDirectoryName(fileName).Replace(InputFolder, "");
                        destinationFolder = System.IO.Path.Combine(outputFolder, subfolder.Trim(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));

                        try
                        {
                            System.IO.Directory.CreateDirectory(destinationFolder);
                        }
                        catch (Exception)
                        {
                            this.Invoke((MethodInvoker) delegate()
                            {
                                MessageBox.Show("ConverSplitterPlus could not create a subdirectory in your output folder.\r\nIs your output directory write protected?");
                            });
                            e.Cancel = true;

                            //the files for this folder cannot be written, so stop here
                            return;
                        }
                    }

                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    //speaker tag -> all text attributed to that speaker in this file
                    Dictionary <string, StringBuilder> Text_Split = new Dictionary <string, StringBuilder>();

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    });

                    string[] readText_Lines = NewlineClean.Split(File.ReadAllText(fileName, SelectedEncoding));

                    //the most recent speaker found, used to attribute untagged lines
                    string PreviousSpeaker = "";

                    //loop through all of the lines in each text
                    foreach (string RawLine in readText_Lines)
                    {
                        string CurrentLine = UsingRegex
                                                 ? CompiledRegex.Replace(RawLine, "").Trim()
                                                 : RawLine.Trim();

                        //if the line is empty, move along... move along
                        if (CurrentLine.Length == 0)
                        {
                            continue;
                        }

                        bool FoundSpeaker = false;

                        //the first speaker tag in the list that the line starts with wins
                        foreach (string Speaker in SpeakerList)
                        {
                            //tags are identifiers rather than natural-language text, so compare ordinally
                            if (!CurrentLine.StartsWith(Speaker, StringComparison.Ordinal))
                            {
                                continue;
                            }

                            FoundSpeaker    = true;
                            PreviousSpeaker = Speaker;

                            //drop the speaker tag from the beginning of the line
                            string Utterance = CurrentLine.Substring(Speaker.Length).Trim();

                            StringBuilder SpeakerText;

                            if (!Text_Split.TryGetValue(Speaker, out SpeakerText))
                            {
                                SpeakerText = new StringBuilder();
                                Text_Split.Add(Speaker, SpeakerText);
                            }

                            SpeakerText.Append(Utterance).Append("\r\n");

                            //on to the next line in the text
                            break;
                        }

                        //an untagged line is attributed to the previous speaker, if that option is on
                        if (!FoundSpeaker && PreviousSpeaker != "" && SpeakerMultipleLines)
                        {
                            Text_Split[PreviousSpeaker].Append(CurrentLine).Append("\r\n");
                        }
                    }

                    //here's where we want to write the output! hooray!
                    foreach (KeyValuePair <string, StringBuilder> entry in Text_Split)
                    {
                        string OutputFilename = Path.GetFileNameWithoutExtension(fileName) + ";" + entry.Key + ".txt";

                        //clean up broken filenames
                        foreach (var c in Path.GetInvalidFileNameChars())
                        {
                            OutputFilename = OutputFilename.Replace(c, '_');
                        }

                        //set the full path of our output
                        OutputFilename = System.IO.Path.Combine(destinationFolder, OutputFilename);

                        // write the output
                        using (StreamWriter outputFile = new StreamWriter(new FileStream(OutputFilename, FileMode.Create, FileAccess.Write), SelectedEncoding))
                        {
                            outputFile.Write(entry.Value.ToString());
                        }
                    }
                }
            }
            catch (Exception)
            {
                this.Invoke((MethodInvoker) delegate()
                {
                    MessageBox.Show("ConverSplitterPlus could not open your output file\r\nfor writing. Is the file open in another application?");
                });
            }
        }
Beispiel #9
0
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            Vocabulate.DictionaryData DictData  = (Vocabulate.DictionaryData)e.Argument;
            TwitterAwareTokenizer     Tokenizer = new TwitterAwareTokenizer();

            Tokenizer.Initialize_Regex();
            Vocabulate.StopWordRemover StopList = new Vocabulate.StopWordRemover();
            StopList.BuildStopList(DictData.StopListRawText);

            //sets up how many columns we're using for output
            short OutputColumnsModifier = 2;

            if (DictData.RawWordCounts)
            {
                OutputColumnsModifier = 4;
            }
            short OutputCapturedText = 0;

            if (DictData.OutputCapturedText)
            {
                OutputCapturedText = 1;
            }


            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });



            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);

            string CSVQuote     = DictData.CSVQuote.ToString();
            string CSVDelimiter = DictData.CSVDelimiter.ToString();

            try {
                //open up the output file
                using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
                {
                    short NumberOfHeaderLeadingColumns = 9;

                    //write the header row to the output file
                    StringBuilder HeaderString = new StringBuilder();
                    HeaderString.Append(CSVQuote + "Filename" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "WC" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "TC_Raw" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "TTR_Raw" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "TC_Clean" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "TTR_Clean" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "TC_NonDict" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "TTR_NonDict" + CSVQuote + CSVDelimiter +
                                        CSVQuote + "DictPercent" + CSVQuote);


                    //output headers for the Concept-constrained Concept-Word Ratio (CWR)
                    for (int i = 0; i < DictData.NumCats; i++)
                    {
                        HeaderString.Append(CSVDelimiter + CSVQuote +
                                            DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_CWR" +
                                            CSVQuote);
                    }


                    //output headers for the Concept-Category Ratio (CCR)
                    for (int i = 0; i < DictData.NumCats; i++)
                    {
                        HeaderString.Append(CSVDelimiter + CSVQuote +
                                            DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_CCR" +
                                            CSVQuote);
                    }

                    //if they want the raw category counts, then we add those to the header as well
                    if (DictData.RawWordCounts)
                    {
                        for (int i = 0; i < DictData.NumCats; i++)
                        {
                            HeaderString.Append(CSVDelimiter + CSVQuote +
                                                DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_Count" +
                                                CSVQuote);
                        }
                        for (int i = 0; i < DictData.NumCats; i++)
                        {
                            HeaderString.Append(CSVDelimiter + CSVQuote +
                                                DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_Unique" +
                                                CSVQuote);
                        }
                    }

                    if (DictData.OutputCapturedText)
                    {
                        HeaderString.Append(CSVDelimiter + CSVQuote + "CapturedText" + CSVQuote);
                    }

                    outputFile.WriteLine(HeaderString.ToString());


                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);
                        Dictionary <string, ulong> DictionaryResults = new Dictionary <string, ulong>();
                        foreach (string Concept in DictData.ConceptMap.Keys)
                        {
                            DictionaryResults.Add(Concept, 0);
                        }

                        //structure of DictionaryResults will look like this:

                        //Concept -> Total

                        //this will make it far easier to go through and calculate number of unique concepts divided by total number of words
                        //at the top level categories down the road



                        //for (int i = 0; i < DictData.NumCats; i++) DictionaryResults.Add(DictData.CatValues[i], 0);

                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });



                        //read in the text file, convert everything to lowercase
                        string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();



                        int NumberOfMatches = 0;

                        int WordCount_WhitespaceTokenizer = Tokenizer.TokenizeWhitespace(readText.Trim()).Length;

                        //splits everything out into words
                        string[] Words = Tokenizer.tokenize(readText.Trim());
                        Words = StopList.ClearStopWords(Words);

                        int    TotalStringLength_BeforeStopList = Words.Length;
                        double TTR_Raw = (Words.Distinct().Count() / (double)TotalStringLength_BeforeStopList) * 100;


                        Words = Words.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
                        int    TotalStringLength_AfterStopList = Words.Length;
                        double TTR_Clean = (Words.Distinct().Count() / (double)TotalStringLength_AfterStopList) * 100;

                        StringBuilder CapturedText = new StringBuilder();

                        List <string> NonmatchedTokens = new List <string>();


                        //     _                _                 _____         _
                        //    / \   _ __   __ _| |_   _ _______  |_   _|____  _| |_
                        //   / _ \ | '_ \ / _` | | | | |_  / _ \   | |/ _ \ \/ / __|
                        //  / ___ \| | | | (_| | | |_| |/ /  __/   | |  __/>  <| |_
                        // /_/   \_\_| |_|\__,_|_|\__, /___\___|   |_|\___/_/\_\\__|
                        //                        |___/


                        //iterate over all words in the text file
                        for (int i = 0; i < TotalStringLength_AfterStopList; i++)
                        {
                            bool TokenMatched = false;
                            //iterate over n-grams, starting with the largest possible n-gram (derived from the user's dictionary file)
                            for (int NumberOfWords = DictData.MaxWords; NumberOfWords > 0; NumberOfWords--)
                            {
                                //make sure that we don't overextend past the array
                                if (i + NumberOfWords - 1 >= TotalStringLength_AfterStopList)
                                {
                                    continue;
                                }

                                //make the target string

                                string TargetString;

                                if (NumberOfWords > 1)
                                {
                                    TargetString = String.Join(" ", Words.Skip(i).Take(NumberOfWords).ToArray());
                                }
                                else
                                {
                                    TargetString = Words[i];
                                }


                                //look for an exact match

                                if (DictData.FullDictionaryMap["Standards"].ContainsKey(NumberOfWords))
                                {
                                    if (DictData.FullDictionaryMap["Standards"][NumberOfWords].ContainsKey(TargetString))
                                    {
                                        //add in the number of words found
                                        NumberOfMatches += NumberOfWords;

                                        //increment results
                                        DictionaryResults[DictData.FullDictionaryMap["Standards"][NumberOfWords][TargetString]] += 1;


                                        //manually increment the for loop so that we're not testing on words that have already been picked up
                                        i += NumberOfWords - 1;
                                        //break out of the lower level for loop back to moving on to new words altogether
                                        TokenMatched = true;

                                        if (DictData.OutputCapturedText)
                                        {
                                            CapturedText.Append(TargetString.Replace(CSVQuote, CSVQuote + CSVQuote) + " ");
                                        }

                                        break;
                                    }
                                }
                                //if there isn't an exact match, we have to go through the wildcards
                                if (DictData.WildCardArrays.ContainsKey(NumberOfWords))
                                {
                                    for (int j = 0; j < DictData.WildCardArrays[NumberOfWords].Length; j++)
                                    {
                                        if (DictData.PrecompiledWildcards[DictData.WildCardArrays[NumberOfWords][j]].Matches(TargetString).Count > 0)
                                        {
                                            //add in the number of words found
                                            NumberOfMatches += NumberOfWords;

                                            //increment results
                                            DictionaryResults[DictData.FullDictionaryMap["Wildcards"][NumberOfWords][DictData.WildCardArrays[NumberOfWords][j]]] += 1;

                                            //manually increment the for loop so that we're not testing on words that have already been picked up
                                            i += NumberOfWords - 1;
                                            //break out of the lower level for loop back to moving on to new words altogether
                                            TokenMatched = true;

                                            if (DictData.OutputCapturedText)
                                            {
                                                CapturedText.Append(TargetString.Replace(CSVQuote, CSVQuote + CSVQuote) + " ");
                                            }

                                            break;
                                        }
                                    }
                                }
                            }

                            //this is what we do if we didn't find any match in our dictionary
                            if (!TokenMatched)
                            {
                                NonmatchedTokens.Add(Words[i]);
                            }
                        }



                        // __        __    _ _          ___        _               _
                        // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                        //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                        //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                        //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                        //                                            |_|



                        string[] OutputString = new string[NumberOfHeaderLeadingColumns + (DictData.NumCats * OutputColumnsModifier) + OutputCapturedText];

                        for (int i = 0; i < OutputString.Length; i++)
                        {
                            OutputString[i] = "";
                        }


                        OutputString[0] = CSVQuote + Filename_Clean + CSVQuote;        //filename
                        OutputString[1] = WordCount_WhitespaceTokenizer.ToString();    //WordCount
                        OutputString[2] = TotalStringLength_BeforeStopList.ToString(); //total number of words
                        if (TotalStringLength_BeforeStopList > 0)
                        {
                            OutputString[3] = TTR_Raw.ToString();                     //TTR_Raw
                        }
                        OutputString[4] = TotalStringLength_AfterStopList.ToString(); //total number of tokens after stoplist processing
                        if (TotalStringLength_AfterStopList > 0)
                        {
                            OutputString[5] = TTR_Clean.ToString();                                       // TTR_Clean
                        }
                        OutputString[6] = (TotalStringLength_AfterStopList - NumberOfMatches).ToString(); //number of non-dictionary tokens
                        if (NonmatchedTokens.Count() > 0)
                        {
                            OutputString[7] = (((double)NonmatchedTokens.Distinct().Count() / NonmatchedTokens.Count()) * 100).ToString();                               //TTR for non-dictionary words
                        }
                        //calculate and output the results
                        if (TotalStringLength_BeforeStopList > 0)
                        {
                            OutputString[8] = (((double)NumberOfMatches / TotalStringLength_BeforeStopList) * 100).ToString(); //dictpercent


                            //pull together the results here
                            Dictionary <string, ulong[]> CompiledResults = new Dictionary <string, ulong[]>();
                            foreach (string TopLevelCategory in DictData.CatNames)
                            {
                                CompiledResults.Add(TopLevelCategory, new ulong[2] {
                                    0, 0
                                });
                            }

                            foreach (string ConceptKey in DictData.ConceptMap.Keys)
                            {
                                if (DictionaryResults[ConceptKey] > 0)
                                {
                                    for (int i = 0; i < DictData.ConceptMap[ConceptKey].Length; i++)
                                    {
                                        //if the Concept was found in the text, increment the first index (i.e., the number of unique concepts) by 1
                                        CompiledResults[DictData.ConceptMap[ConceptKey][i]][0] += 1;
                                        //if the Concept was found in the text, add the number of times it occurred
                                        CompiledResults[DictData.ConceptMap[ConceptKey][i]][1] += DictionaryResults[ConceptKey];
                                    }
                                }
                            }


                            //this is where we actually calulate and output the CWR scores
                            for (int i = 0; i < DictData.CategoryOrder.Count; i++)
                            {
                                if (WordCount_WhitespaceTokenizer > 0)
                                {
                                    OutputString[i + NumberOfHeaderLeadingColumns] = (((double)CompiledResults[DictData.CategoryOrder[i]][0] / WordCount_WhitespaceTokenizer) * 100.0).ToString();
                                }
                            }

                            //this is where we actually calulate and output the CCR scores
                            for (int i = 0; i < DictData.CategoryOrder.Count; i++)
                            {
                                if (CompiledResults[DictData.CategoryOrder[i]][0] > 0)
                                {
                                    OutputString[i + NumberOfHeaderLeadingColumns + DictData.NumCats] = (((double)CompiledResults[DictData.CategoryOrder[i]][0] / CompiledResults[DictData.CategoryOrder[i]][1]) * 100.0).ToString();
                                }
                            }

                            //this is if the user asked for the raw counts per category
                            if (DictData.RawWordCounts)
                            {
                                for (int i = 0; i < DictData.CategoryOrder.Count; i++)
                                {
                                    OutputString[i + NumberOfHeaderLeadingColumns + (DictData.NumCats * 2)] = CompiledResults[DictData.CategoryOrder[i]][1].ToString();
                                    OutputString[i + NumberOfHeaderLeadingColumns + (DictData.NumCats * 3)] = CompiledResults[DictData.CategoryOrder[i]][0].ToString();
                                }
                            }
                        }
                        else
                        {
                            OutputString[3] = "";
                            for (int i = 0; i < DictData.NumCats; i++)
                            {
                                OutputString[i + NumberOfHeaderLeadingColumns] = "";
                            }
                        }

                        //if we're outputting the captured strings, we do that here
                        if (DictData.OutputCapturedText)
                        {
                            OutputString[OutputString.Length - 1] = CSVQuote + CapturedText.ToString() + CSVQuote;
                        }


                        outputFile.WriteLine(String.Join(CSVDelimiter, OutputString));
                    }
                }
            }
            catch
            {
                MessageBox.Show("Vocabulate encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while Vocabulate is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Beispiel #10
0
        /// <summary>
        /// Worker routine that runs the Stanford CoreNLP sentiment annotator over every "*.txt" file
        /// found in <c>DictData.TextFileFolder</c> and writes two CSV files: one row per input file
        /// (at <c>DictData.OutputFileLocation</c>) and one row per sentence (the same path passed
        /// through <c>AddSuffix</c> with "_Sentences").
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; <c>e.Argument</c> is cast to <c>DictionaryData</c>.</param>
        /// <remarks>
        /// Exceptions are deliberately not caught here (the original try/catch was disabled), so any
        /// I/O or annotation failure propagates to the caller of this handler.
        /// </remarks>
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            DictionaryData DictData = (DictionaryData)e.Argument;


            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker) delegate
            {
                FilenameLabel.Text = "Loading CoreNLP models... please wait...";
            });

            //largely taken from here: https://github.com/sergey-tihon/Stanford.NLP.NET/issues/39
            var jarRoot = @"stanford-corenlp-full-2018-02-27\";
            var props   = new java.util.Properties();

            props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
            props.setProperty("sutime.binders", "0");

            //the pipeline is constructed with the working directory pointed at the CoreNLP folder;
            //the previous working directory is always restored afterwards so that relative
            //input/output paths keep resolving against the directory the user expects
            var curDir = Environment.CurrentDirectory;
            StanfordCoreNLP pipeline;

            Directory.SetCurrentDirectory(Path.Combine(Path.GetDirectoryName(AppDomain.CurrentDomain.BaseDirectory), jarRoot));
            try
            {
                pipeline = new StanfordCoreNLP(props);
            }
            finally
            {
                Directory.SetCurrentDirectory(curDir);
            }



            //read every user setting we need from the UI thread in a single hop
            Encoding SelectedEncoding = null;
            bool     ScanSubfolders   = false;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
                ScanSubfolders   = ScanSubfolderCheckbox.Checked;
            });



            //get the list of files
            var SearchDepth = ScanSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var files       = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);



            //open up the document-level and sentence-level output files
            using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
            {
                using (StreamWriter outputFileSentences = new StreamWriter(new FileStream(AddSuffix(DictData.OutputFileLocation, "_Sentences"), FileMode.Create), SelectedEncoding))
                {
                    //write the header rows to the output files
                    outputFile.WriteLine("\"Filename\",\"Sentences\",\"Classification\",\"Classification_M\",\"Classification_SD\"");
                    outputFileSentences.WriteLine("\"Filename\",\"SentNumber\",\"SentenceText\",\"Classification\",\"Class_Prob\",\"Class_Number\"");

                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);

                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });



                        //read in the text file and trim surrounding whitespace (case is left untouched)
                        string InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();



                        // ---- Analyze Text ----

                        var annotation = new edu.stanford.nlp.pipeline.Annotation(InputText);
                        pipeline.annotate(annotation);

                        List <double> SentimentValues = new List <double>();

                        var sentences = annotation.get(new CoreAnnotations.SentencesAnnotation().getClass()) as ArrayList;

                        int SentenceCount = 0;

                        //a failed cast leaves "sentences" null; treat that as "no sentences"
                        //instead of crashing the whole batch
                        if (sentences != null)
                        {
                            foreach (CoreMap sentence in sentences)
                            {
                                SentenceCount++;
                                Tree tree = sentence.get(new SentimentCoreAnnotations.SentimentAnnotatedTree().getClass()) as Tree;

                                //look the predicted class up once and reuse it for every column below
                                var PredictedClass = RNNCoreAnnotations.getPredictedClass(tree);

                                //add this sentence to our overall list of sentiment scores
                                SentimentValues.Add(PredictedClass);

                                // ---- Write Output (sentence level) ----

                                string[] OutputString_SentenceLevel = new string[6];

                                string Classification = GetClassification((double)PredictedClass);


                                OutputString_SentenceLevel[0] = "\"" + Filename_Clean + "\"";
                                OutputString_SentenceLevel[1] = SentenceCount.ToString();
                                OutputString_SentenceLevel[2] = "\"" + sentence.ToString().Replace("\"", "\"\"") + "\"";
                                OutputString_SentenceLevel[3] = Classification;
                                OutputString_SentenceLevel[4] = RNNCoreAnnotations.getPredictedClassProb(tree.label()).ToString();
                                OutputString_SentenceLevel[5] = PredictedClass.ToString();

                                outputFileSentences.WriteLine(String.Join(",", OutputString_SentenceLevel));
                            }
                        }



                        // ---- Write Output (file level) ----

                        string[] OutputString = new string[5];
                        OutputString[0] = "\"" + Filename_Clean + "\"";
                        OutputString[1] = SentenceCount.ToString();

                        if (SentimentValues.Count > 0)
                        {
                            double MeanSentiment = SentimentValues.Average();

                            OutputString[2] = GetClassification(MeanSentiment);
                            OutputString[3] = MeanSentiment.ToString();
                            OutputString[4] = StandardDeviation(SentimentValues).ToString();
                        }
                        else
                        {
                            //Average() throws on an empty sequence, so a file without any
                            //sentences gets blank summary columns rather than aborting the run
                            OutputString[2] = "";
                            OutputString[3] = "";
                            OutputString[4] = "";
                        }

                        outputFile.WriteLine(String.Join(",", OutputString));
                    }



                    //this is the closing bracket for the sentence-level "using" filestream
                }

                //this is the closing bracket for the document-level "using" filestream
            }
        }
Beispiel #11
0
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            DictionaryData DictData = (DictionaryData)e.Argument;

            //set up our sentence boundary detection
            Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });



            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);



            try {
                //open up the output file
                using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
                {
                    //write the header row to the output file
                    StringBuilder HeaderString = new StringBuilder();
                    HeaderString.Append("\"Filename\",\"WC\",\"DictPercent\"");
                    for (int i = 0; i < DictData.NumCats; i++)
                    {
                        HeaderString.Append("," + DictData.CatNames[i].Replace("\"", "\"\""));
                    }
                    outputFile.WriteLine(HeaderString.ToString());


                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);
                        Dictionary <string, int> DictionaryResults = new Dictionary <string, int>();
                        for (int i = 0; i < DictData.NumCats; i++)
                        {
                            DictionaryResults.Add(DictData.CatValues[i], 0);
                        }

                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });



                        //read in the text file, convert everything to lowercase
                        string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();
                        readText = NewlineClean.Replace(readText, " ");

                        //remove all the junk punctuation
                        foreach (char c in PunctuationBox.Text)
                        {
                            readText = readText.Replace(c, ' ');
                        }


                        int NumberOfMatches = 0;


                        //splits everything out into words
                        //we're splitting on spaces here principally because we leave it up to the
                        //user to decide what characters they want to remove. we're assuming that
                        //they have removed tabs already (as is set up by default)
                        string[] Words = readText.Trim().Split(' ');

                        Words = Words.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
                        int TotalStringLength = Words.Length;



                        //     _                _                 _____         _
                        //    / \   _ __   __ _| |_   _ _______  |_   _|____  _| |_
                        //   / _ \ | '_ \ / _` | | | | |_  / _ \   | |/ _ \ \/ / __|
                        //  / ___ \| | | | (_| | | |_| |/ /  __/   | |  __/>  <| |_
                        // /_/   \_\_| |_|\__,_|_|\__, /___\___|   |_|\___/_/\_\\__|
                        //                        |___/


                        //iterate over all words in the text file
                        for (int i = 0; i < TotalStringLength; i++)
                        {
                            //iterate over n-grams, starting with the largest possible n-gram (derived from the user's dictionary file)
                            for (int NumberOfWords = DictData.MaxWords; NumberOfWords > 0; NumberOfWords--)
                            {
                                //make sure that we don't overextend past the array
                                if (i + NumberOfWords - 1 >= TotalStringLength)
                                {
                                    continue;
                                }

                                //make the target string

                                string TargetString;

                                if (NumberOfWords > 1)
                                {
                                    TargetString = String.Join(" ", Words.Skip(i).Take(NumberOfWords).ToArray());
                                }
                                else
                                {
                                    TargetString = Words[i];
                                }


                                //look for an exact match

                                if (DictData.FullDictionary["Standards"].ContainsKey(NumberOfWords))
                                {
                                    if (DictData.FullDictionary["Standards"][NumberOfWords].ContainsKey(TargetString))
                                    {
                                        NumberOfMatches += NumberOfWords;
                                        //add in the number of words found
                                        for (int j = 0; j < DictData.FullDictionary["Standards"][NumberOfWords][TargetString].Length; j++)
                                        {
                                            if (DictionaryResults.ContainsKey(DictData.FullDictionary["Standards"][NumberOfWords][TargetString][j]))
                                            {
                                                DictionaryResults[DictData.FullDictionary["Standards"][NumberOfWords][TargetString][j]] += NumberOfWords;
                                            }
                                        }
                                        //manually increment the for loop so that we're not testing on words that have already been picked up
                                        i += NumberOfWords - 1;
                                        //break out of the lower level for loop back to moving on to new words altogether
                                        break;
                                    }
                                }
                                //if there isn't an exact match, we have to go through the wildcards
                                if (DictData.WildCardArrays.ContainsKey(NumberOfWords))
                                {
                                    for (int j = 0; j < DictData.WildCardArrays[NumberOfWords].Length; j++)
                                    {
                                        if (DictData.PrecompiledWildcards[DictData.WildCardArrays[NumberOfWords][j]].Matches(TargetString).Count > 0)
                                        {
                                            NumberOfMatches += NumberOfWords;

                                            for (int k = 0; k < DictData.FullDictionary["Wildcards"][NumberOfWords][DictData.WildCardArrays[NumberOfWords][j]].Length; k++)
                                            {
                                                if (DictionaryResults.ContainsKey(DictData.FullDictionary["Wildcards"][NumberOfWords][DictData.WildCardArrays[NumberOfWords][j]][k]))
                                                {
                                                    DictionaryResults[DictData.FullDictionary["Wildcards"][NumberOfWords][DictData.WildCardArrays[NumberOfWords][j]][k]] += NumberOfWords;
                                                }
                                            }
                                            //manually increment the for loop so that we're not testing on words that have already been picked up
                                            i += NumberOfWords - 1;
                                            //break out of the lower level for loop back to moving on to new words altogether
                                            break;
                                        }
                                    }
                                }
                            }
                        }



                        // __        __    _ _          ___        _               _
                        // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                        //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                        //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                        //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                        //                                            |_|



                        string[] OutputString = new string[3 + DictData.NumCats];
                        OutputString[0] = "\"" + Filename_Clean + "\"";
                        OutputString[1] = TotalStringLength.ToString();

                        if (TotalStringLength > 0)
                        {
                            OutputString[2] = (((double)NumberOfMatches / TotalStringLength) * 100).ToString();

                            if (DictData.RawWordCounts)
                            {
                                for (int i = 0; i < DictData.NumCats; i++)
                                {
                                    OutputString[i + 3] = DictionaryResults[DictData.CatValues[i]].ToString();
                                }
                            }
                            else
                            {
                                for (int i = 0; i < DictData.NumCats; i++)
                                {
                                    OutputString[i + 3] = (((double)DictionaryResults[DictData.CatValues[i]] / TotalStringLength) * 100).ToString();
                                }
                            }
                        }
                        else
                        {
                            OutputString[2] = "";
                            for (int i = 0; i < DictData.NumCats; i++)
                            {
                                OutputString[i + 3] = "";
                            }
                        }


                        outputFile.WriteLine(String.Join(",", OutputString));
                    }
                }
            }
            catch
            {
                MessageBox.Show("RIOTLite encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while RIOTLite is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Beispiel #12
0
        // Background worker for the VADER sentiment analysis.
        //
        // For every *.txt file found in DictData.TextFileFolder the text is split into sentences,
        // each sentence is scored with VADER, and two CSV files are written:
        //   - DictData.OutputFileLocation: one row per file (scores averaged across its sentences)
        //   - the same path with "_Sentences" added via AddSuffix: one row per sentence
        //
        // sender: the object raising the event (unused)
        // e:      e.Argument must be the DictionaryData describing the job
        //
        // Any exception thrown while the output files are open is reported in a message box
        // rather than propagated.
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            DictionaryData DictData = (DictionaryData)e.Argument;

            SentimentIntensityAnalyzer VADER = new SentimentIntensityAnalyzer();

            //set up our sentence boundary detection
            Regex SentenceSplitter = new Regex(@"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", RegexOptions.Compiled);

            //the CSV fields are joined with bare commas and the numbers are not quoted, so scores
            //must always use '.' as the decimal separator; a culture that uses ',' would otherwise
            //split a single number across two columns
            System.Globalization.CultureInfo CsvCulture = System.Globalization.CultureInfo.InvariantCulture;

            //maps a compound score onto the label written to the "Classification" column
            Func<double, string> Classify = Compound => Compound > 0.05 ? "pos" : (Compound > -0.05 ? "neut" : "neg");

            //user selections are read on the UI thread, since that is where the controls live
            Encoding SelectedEncoding = null;
            bool     ScanSubfolders   = false;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
                ScanSubfolders   = ScanSubfolderCheckbox.Checked;
            });



            //get the list of files
            var SearchDepth = ScanSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var files       = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);



            try {
                //open up both output files (file-level and sentence-level)
                using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
                {
                    using (StreamWriter outputFileSentences = new StreamWriter(new FileStream(AddSuffix(DictData.OutputFileLocation, "_Sentences"), FileMode.Create), SelectedEncoding))
                    {
                        //write the header rows to the output files
                        outputFile.WriteLine("\"Filename\",\"WC\",\"Sentences\",\"Classification\",\"Compound_M\",\"Positive_M\",\"Negative_M\",\"Neutral_M\"");
                        outputFileSentences.WriteLine("\"Filename\",\"WC\",\"Sentence\",\"Classification\",\"Compound_M\",\"Positive_M\",\"Negative_M\",\"Neutral_M\"");

                        foreach (string fileName in files)
                        {
                            //set up our variables to report
                            string Filename_Clean = Path.GetFileName(fileName);

                            //report what we're working on
                            FilenameLabel.Invoke((MethodInvoker) delegate
                            {
                                FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                            });



                            //read in the text file and strip leading/trailing whitespace
                            string InputText = File.ReadAllText(fileName, SelectedEncoding).Trim();

                            string[] Sentences = SentenceSplitter.Split(InputText).Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();

                            //number of whitespace-delimited tokens in the whole file
                            int TotalStringLength = InputText.Split().Count(x => !string.IsNullOrWhiteSpace(x));
                            int TotalSentences    = Sentences.Length;



                            //     _                _                 _____         _
                            //    / \   _ __   __ _| |_   _ _______  |_   _|____  _| |_
                            //   / _ \ | '_ \ / _` | | | | |_  / _ \   | |/ _ \ \/ / __|
                            //  / ___ \| | | | (_| | | |_| |/ /  __/   | |  __/>  <| |_
                            // /_/   \_\_| |_|\__,_|_|\__, /___\___|   |_|\___/_/\_\\__|
                            //                        |___/

                            int[] Sentence_WC = new int[Sentences.Length];
                            VaderSharp.SentimentAnalysisResults[] results = new VaderSharp.SentimentAnalysisResults[Sentences.Length];

                            for (int i = 0; i < Sentences.Length; i++)
                            {
                                results[i]     = VADER.PolarityScores(Sentences[i]);
                                Sentence_WC[i] = Sentences[i].Split().Count(x => !string.IsNullOrWhiteSpace(x));
                            }



                            // __        __    _ _          ___        _               _
                            // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                            //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                            //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                            //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                            //                                            |_|



                            //columns: Filename, WC, Sentences, Classification, Compound, Positive, Negative, Neutral
                            string[] OutputString = new string[8];
                            OutputString[0] = "\"" + Filename_Clean + "\"";
                            OutputString[1] = "0";
                            OutputString[2] = TotalSentences.ToString();
                            OutputString[3] = "";

                            if (TotalStringLength > 0)
                            {
                                int    TotalWC     = 0;
                                double SumPositive = 0.0;
                                double SumNeutral  = 0.0;
                                double SumNegative = 0.0;
                                double SumCompound = 0.0;

                                for (int i = 0; i < TotalSentences; i++)
                                {
                                    TotalWC     += Sentence_WC[i];
                                    SumPositive += results[i].Positive;
                                    SumNeutral  += results[i].Neutral;
                                    SumNegative += results[i].Negative;
                                    SumCompound += results[i].Compound;

                                    //write the sentence-level output
                                    string[] OutputString_Sentence_Level = new string[8];
                                    OutputString_Sentence_Level[0] = "\"" + Filename_Clean + "\"";
                                    OutputString_Sentence_Level[1] = Sentence_WC[i].ToString();
                                    //the sentence is quoted, so embedded quotes are doubled
                                    OutputString_Sentence_Level[2] = "\"" + Sentences[i].Replace("\"", "\"\"") + "\"";
                                    OutputString_Sentence_Level[3] = Classify(results[i].Compound);
                                    OutputString_Sentence_Level[4] = results[i].Compound.ToString(CsvCulture);
                                    OutputString_Sentence_Level[5] = results[i].Positive.ToString(CsvCulture);
                                    OutputString_Sentence_Level[6] = results[i].Negative.ToString(CsvCulture);
                                    OutputString_Sentence_Level[7] = results[i].Neutral.ToString(CsvCulture);

                                    outputFileSentences.WriteLine(String.Join(",", OutputString_Sentence_Level));
                                }

                                //file-level scores are the plain mean of the sentence-level scores
                                double MeanPositive = SumPositive / (double)TotalSentences;
                                double MeanNeutral  = SumNeutral / (double)TotalSentences;
                                double MeanNegative = SumNegative / (double)TotalSentences;
                                double MeanCompound = SumCompound / (double)TotalSentences;

                                OutputString[1] = TotalWC.ToString();
                                OutputString[3] = Classify(MeanCompound);
                                OutputString[4] = MeanCompound.ToString(CsvCulture);
                                OutputString[5] = MeanPositive.ToString(CsvCulture);
                                OutputString[6] = MeanNegative.ToString(CsvCulture);
                                OutputString[7] = MeanNeutral.ToString(CsvCulture);
                            }
                            else
                            {
                                //empty file: leave the sentence count and every score column blank.
                                //(indexes 2..7 only; the array has 8 slots, so indexing with an
                                //offset here would run past its end and abort the whole analysis)
                                for (int i = 2; i < OutputString.Length; i++)
                                {
                                    OutputString[i] = "";
                                }
                            }


                            outputFile.WriteLine(String.Join(",", OutputString));
                        }
                    }
                }
            }
            catch
            {
                MessageBox.Show("VADER-Tots encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while VADER-Tots is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Beispiel #13
0
        // Background worker for TranscodeTXT.
        //
        // Every file in DictData.TextFileFolder that matches DictData.FileExtension is read with
        // the selected input encoding and written, under the same name and relative subfolder,
        // below DictData.OutputFileLocation using the selected output encoding. Files are
        // processed in parallel, with at most one worker per processor.
        //
        // sender: the object raising the event (unused)
        // e:      e.Argument must be the DictionaryData describing the job
        //
        // Any exception thrown by the parallel loop is reported in a message box rather than
        // propagated.
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            DictionaryData DictData = (DictionaryData)e.Argument;


            //user selections are read on the UI thread, since that is where the controls live
            Encoding InputSelectedEncoding  = null;
            Encoding OutputSelectedEncoding = null;
            bool     ScanSubfolders         = false;

            this.Invoke((MethodInvoker) delegate()
            {
                InputSelectedEncoding  = Encoding.GetEncoding(InputEncodingDropdown.SelectedItem.ToString());
                OutputSelectedEncoding = Encoding.GetEncoding(OutputEncodingDropdown.SelectedItem.ToString());
                ScanSubfolders         = ScanSubfolderCheckbox.Checked;
            });



            //get the list of files
            var SearchDepth = ScanSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var files       = Directory.EnumerateFiles(DictData.TextFileFolder, DictData.FileExtension, SearchDepth);



            try {
                //we want to be conservative and limit the number of threads to the number of processors that we have
                var options = new ParallelOptions {
                    MaxDegreeOfParallelism = Environment.ProcessorCount
                };
                Parallel.ForEach(files, options, (string fileName) =>
                {
                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    //the file's folder relative to the input folder ("" for top-level files)
                    string SubDirStructure = Path.GetDirectoryName(fileName).Replace(DictData.TextFileFolder, "").TrimStart('\\');



                    //creates subdirs if they don't exist; CreateDirectory does nothing for a folder
                    //that is already there, so no existence check is needed (and none that could
                    //race with the other workers)
                    string Output_Location = DictData.OutputFileLocation + '\\' + SubDirStructure;
                    Directory.CreateDirectory(Output_Location);

                    Output_Location = Path.Combine(Output_Location, Filename_Clean);

                    //report what we're working on. Refresh() already invalidates and repaints the
                    //label; pumping messages with DoEvents from inside an invoked delegate would
                    //let other workers' updates run re-entrantly, so it is deliberately not done
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Processing: " + Filename_Clean;
                        FilenameLabel.Refresh();
                    });



                    // __        __    _ _          ___        _               _
                    // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                    //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                    //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                    //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                    //                                            |_|


                    //the input file stays open until the output has been written
                    using (StreamReader inputfile = new StreamReader(fileName, InputSelectedEncoding))
                    {
                        string readText = inputfile.ReadToEnd();

                        if (DictData.FixNULtermination)
                        {
                            //drop every NUL character from the text
                            readText = readText.Replace("\0", "");
                        }

                        //open up the output file
                        using (StreamWriter outputFile = new StreamWriter(new FileStream(Output_Location, FileMode.Create), OutputSelectedEncoding))
                        {
                            outputFile.Write(readText);
                        }
                    }
                });
            }
            catch
            {
                MessageBox.Show("TranscodeTXT encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while transcoding", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Beispiel #14
0
        // Background worker for ExamineTXT.
        //
        // Writes one CSV row to DictData.OutputFileLocation for every file in
        // DictData.TextFileFolder that matches DictData.FileExtension: the file's path, its
        // creation time, its size in KB and the encoding reported by
        // SimpleHelpers.FileEncoding.DetectFileEncoding ("[UNKNOWN]" when that returns null).
        //
        // sender: the object raising the event (unused)
        // e:      e.Argument must be the DictionaryData describing the job
        //
        // Any exception thrown while the output file is open is reported in a message box
        // rather than propagated.
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            DictionaryData DictData = (DictionaryData)e.Argument;


            //user selections are read on the UI thread, since that is where the controls live
            Encoding OutputSelectedEncoding = null;
            bool     ScanSubfolders         = false;

            this.Invoke((MethodInvoker) delegate()
            {
                OutputSelectedEncoding = Encoding.GetEncoding(OutputEncodingDropdown.SelectedItem.ToString());
                ScanSubfolders         = ScanSubfolderCheckbox.Checked;
            });



            //get the list of files
            var SearchDepth = ScanSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var files       = Directory.EnumerateFiles(DictData.TextFileFolder, DictData.FileExtension, SearchDepth);



            try {
                using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), OutputSelectedEncoding))
                {
                    //header row
                    outputFile.WriteLine("\"Filename\",\"Created\",\"FileSizeKB\",\"Encoding\"");



                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);

                        //report what we're working on. Refresh() already invalidates and repaints
                        //the label, so no extra Invalidate/Update/DoEvents calls are needed
                        FilenameLabel.Invoke((MethodInvoker) delegate {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                            FilenameLabel.Refresh();
                        });


                        FileInfo oFileInfo            = new FileInfo(fileName);
                        string   FileEncodingDetected = ExamineTXT.SimpleHelpers.FileEncoding.DetectFileEncoding(fileName);

                        string[] OutputString = new string[4];

                        //note: the full path as enumerated is written, not just the file name
                        OutputString[0] = fileName;
                        OutputString[1] = oFileInfo.CreationTime.ToString();
                        //"0.##" keeps a leading zero, so an empty file reports "0" instead of a
                        //blank cell and a half-kilobyte file reports "0.5" instead of ".5"
                        OutputString[2] = (oFileInfo.Length / 1024.0).ToString("0.##");
                        OutputString[3] = FileEncodingDetected ?? "[UNKNOWN]";



                        //every field is wrapped in double quotes
                        outputFile.WriteLine("\"" + string.Join("\",\"", OutputString) + "\"");
                    }
                }
            }
            catch
            {
                MessageBox.Show("ExamineTXT encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Beispiel #15
0
        private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            uint WordWindowSize  = 100;
            uint MaxPhraseLength = 3;
            uint BigWordSize     = 6;

            //set up our sentence boundary detection
            Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
                WordWindowSize   = Convert.ToUInt32(WordWindowSizeTextbox.Text);
                MaxPhraseLength  = Convert.ToUInt32(PhraseLengthTextbox.Text);
                BigWordSize      = Convert.ToUInt32(BigWordTextBox.Text);
            });

            if (WordWindowSize < 2)
            {
                WordWindowSize = 2;
            }
            if (MaxPhraseLength > WordWindowSize - 1)
            {
                MaxPhraseLength = WordWindowSize - 1;
            }
            if (MaxPhraseLength < 1)
            {
                MaxPhraseLength = 1;
            }


            //the very first thing that we want to do is set up our function word lists
            List <string> FunctionWordWildcardList = new List <string>();
            List <string> FunctionWordsToHash      = new List <string>();

            string[] OriginalFunctionWordList = NewlineClean.Split(FunctionWordTextBox.Text.ToLower());
            OriginalFunctionWordList = OriginalFunctionWordList.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();

            foreach (string Word in OriginalFunctionWordList)
            {
                string WordToParse = Word.Trim();

                if (WordToParse.Contains('*'))
                {
                    FunctionWordWildcardList.Add(WordToParse.Replace("*", ""));
                }
                else
                {
                    FunctionWordsToHash.Add(WordToParse);
                }
            }

            //remove duplicates
            FunctionWordWildcardList = FunctionWordWildcardList.Distinct().ToList();
            FunctionWordsToHash      = FunctionWordsToHash.Distinct().ToList();

            HashSet <string> HashedFuncWords = new HashSet <string>(FunctionWordsToHash);

            string[] FunctionWordWildCards = FunctionWordWildcardList.ToArray();

            FunctionWordsToHash      = null;
            FunctionWordWildcardList = null;



            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(((string[])e.Argument)[0], "*.txt", SearchDepth);



            try {
                using (StreamWriter outputFile = new StreamWriter(((string[])e.Argument)[1]))
                {
                    string HeaderString = "\"Filename\",\"WC\",\"BigWordPercent\",\"AvgUniqueWPWindow\",\"Overall_Repeat_1word\",\"Funct_Repeat_1word\",\"Content_Repeat_1word\",\"BigWordRepeat\"";

                    for (ushort i = 2; i <= MaxPhraseLength; i += 1)
                    {
                        HeaderString += ",\"Overall_Repeat_" + i.ToString() + "word\"";
                    }

                    outputFile.WriteLine(HeaderString);


                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean     = Path.GetFileName(fileName);
                        int    TotalNumberOfWords = 0;

                        double AvgUniqueWPWindow = 0;
                        double TotalRepetition   = 0.0;
                        //double AvgWPS = 0.0;

                        double FunctionWordRepetition = 0.0;
                        double ContentWordRepetition  = 0.0;
                        double SixLtrWordRepetition   = 0;
                        ulong  SixLtrWordsTotal       = 0;

                        //sets up our word phrase dictionaries
                        Dictionary <int, double> PhraseDict = new Dictionary <int, double>();
                        for (ushort i = 2; i <= MaxPhraseLength; i += 1)
                        {
                            PhraseDict.Add(i, 0.0);
                        }


                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });



                        //do stuff here
                        string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();


                        readText = NewlineClean.Replace(readText, " ");

                        //remove all the junk punctuation
                        foreach (char c in PunctuationBox.Text)
                        {
                            readText = readText.Replace(c, ' ');
                        }



                        //splits everything out into words
                        string[] Words = readText.Trim().Split(' ');
                        Words = Words.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
                        for (int i = 0; i < Words.Length; i++)
                        {
                            if (Words[i].Length > BigWordSize - 1)
                            {
                                SixLtrWordsTotal++;
                            }
                        }

                        TotalNumberOfWords += Words.Count();
                        UInt64 ContentWordsDenominator  = 0;
                        UInt64 FunctionWordsDenominator = 0;

                        UInt64 WordWindowIterations = 0;
                        //make sure that the text is at least long enough to analyze
                        if (TotalNumberOfWords >= WordWindowSize)
                        {
                            //this is where we make a moving window
                            for (uint BigCounter = 0; BigCounter <= (Words.Length - WordWindowSize); BigCounter += 1)
                            {
                                WordWindowIterations += 1;

                                var WordWindow = new string[WordWindowSize];
                                Array.Copy(Words, BigCounter, WordWindow, 0, WordWindowSize);

                                //do our full phrase repetition measures
                                for (int i = 2; i <= MaxPhraseLength; i += 1)
                                {
                                    var PhraseWindow = new string[WordWindowSize - (i - 1)];
                                    for (int j = 0; j <= (WordWindowSize - i); j += 1)
                                    {
                                        string[] temp_phrase = new string[i];
                                        Array.Copy(Words, j, temp_phrase, 0, i);
                                        PhraseWindow[j] = String.Join(" ", temp_phrase);
                                    }
                                    //add in the unique phrase percentage
                                    PhraseDict[i] += PhraseWindow.Distinct().ToArray().Length / ((double)WordWindowSize - (i - 1));
                                }

                                //AvgWPS += Words.Count();
                                AvgUniqueWPWindow += WordWindow.Distinct().ToArray().Length;
                                TotalRepetition   += WordWindow.Distinct().ToArray().Length / (double)WordWindowSize;

                                //now we go through and redo the same thing, separately for function words and content words
                                //the first thing that we need to do is separate out the function words from the content words
                                List <string> FunctionWords = new List <string>();
                                List <string> ContentWords  = new List <string>();
                                List <string> SixLtrWords   = new List <string>();

                                for (int i = 0; i < WordWindow.Length; i++)
                                {
                                    //check the length of the word
                                    if (WordWindow[i].Length > BigWordSize - 1)
                                    {
                                        SixLtrWords.Add(WordWindow[i]);
                                    }

                                    //first, check with the hashset
                                    if (HashedFuncWords.Contains(WordWindow[i]))
                                    {
                                        FunctionWords.Add(WordWindow[i]);
                                        continue;
                                    }

                                    //if it wasn't found in the hashset, we'll loop through the wildcard function words
                                    for (int j = 0; j < FunctionWordWildCards.Count(); j++)
                                    {
                                        if (WordWindow[i].StartsWith(FunctionWordWildCards[j]))
                                        {
                                            FunctionWords.Add(WordWindow[i]);
                                            continue;
                                        }
                                    }

                                    //if we haven't moved on to the next word yet, then this is a content word
                                    ContentWords.Add(WordWindow[i]);
                                }

                                if (ContentWords.Count() > 0)
                                {
                                    ContentWordRepetition   += ContentWords.Distinct().ToArray().Length / (double)ContentWords.Count();
                                    ContentWordsDenominator += 1;
                                }
                                if (FunctionWords.Count() > 0)
                                {
                                    FunctionWordRepetition   += FunctionWords.Distinct().ToArray().Length / (double)FunctionWords.Count();
                                    FunctionWordsDenominator += 1;
                                }

                                if (SixLtrWords.Count() > 0)
                                {
                                    SixLtrWordRepetition += SixLtrWords.Distinct().ToArray().Length / (double)SixLtrWords.Count();
                                }
                            }
                        }



                        //divide everything by the number of sentences
                        TotalRepetition        = (float)TotalRepetition / (TotalNumberOfWords - (WordWindowSize - 1));
                        FunctionWordRepetition = (float)FunctionWordRepetition / FunctionWordsDenominator;
                        ContentWordRepetition  = (float)ContentWordRepetition / ContentWordsDenominator;
                        SixLtrWordRepetition   = (float)SixLtrWordRepetition / (TotalNumberOfWords - (WordWindowSize - 1));
                        AvgUniqueWPWindow      = (float)AvgUniqueWPWindow / (TotalNumberOfWords - (WordWindowSize - 1));


                        if (TotalNumberOfWords >= WordWindowSize)
                        {
                            string[] OutputString = new string[8 + MaxPhraseLength - 1];

                            OutputString[0] = '"' + Filename_Clean + '"';
                            OutputString[1] = TotalNumberOfWords.ToString();
                            OutputString[2] = Math.Round((SixLtrWordsTotal / (double)TotalNumberOfWords) * 100, 3).ToString();
                            OutputString[3] = Math.Round(AvgUniqueWPWindow, 3).ToString();
                            OutputString[4] = Math.Round((1 - TotalRepetition) * 100, 3).ToString();
                            OutputString[5] = Math.Round((1 - FunctionWordRepetition) * 100, 3).ToString();
                            OutputString[6] = Math.Round((1 - ContentWordRepetition) * 100, 3).ToString();
                            OutputString[7] = Math.Round((1 - SixLtrWordRepetition) * 100, 3).ToString();

                            for (int i = 0; i < MaxPhraseLength - 1; i += 1)
                            {
                                OutputString[8 + i] = Math.Round((1 - (PhraseDict[i + 2] / ((float)TotalNumberOfWords - (WordWindowSize - 1)))) * 100, 3).ToString();
                            }

                            outputFile.WriteLine(String.Join(",", OutputString));
                        }
                        else
                        {
                            outputFile.WriteLine('"' + Filename_Clean + '"' + "," + TotalNumberOfWords.ToString());
                        }
                    }
                }
            }
            catch
            {
                MessageBox.Show("Repeatalizer could not open your output file\r\nfor writing. Is the file open in another application?");
            }
        }
Beispiel #16
0
        //  ____   ______        __         _                ___   _                          _     _  __ _   _           __
        // | __ ) / ___\ \      / /__  _ __| | _____ _ __   / / | | | ___  __ ___   ___   _  | |   (_)/ _| |_(_)_ __   __ \ \ 
        // |  _ \| |  _ \ \ /\ / / _ \| '__| |/ / _ \ '__| | || |_| |/ _ \/ _` \ \ / / | | | | |   | | |_| __| | '_ \ / _` | |
        // | |_) | |_| | \ V  V / (_) | |  |   <  __/ |    | ||  _  |  __/ (_| |\ V /| |_| | | |___| |  _| |_| | | | | (_| | |
        // |____/ \____|  \_/\_/ \___/|_|  |_|\_\___|_|    | ||_| |_|\___|\__,_| \_/  \__, | |_____|_|_|  \__|_|_| |_|\__, | |
        //                                                  \_\                       |___/                           |___/_/



        // Background-worker body. Builds one averaged vector per word list from a delimited
        // model file, then scores every row of that file against each averaged vector using
        // cosine similarity.
        //
        // e.Argument must be a BgWorkerInformation. The members read here are InputFile,
        // HasHeaders, Delimiters, UsingQuotes, Tokens, Tokens_Altogether, TokenCol, StartingCol,
        // EndingCol, OutputLocation and OmitBelowValue.
        //
        // Files written into BgData.OutputLocation:
        //   _WELP_Subvectors.txt  - token plus raw vector columns for every matched row
        //   _WELP_AvgVector.txt   - one averaged vector per word group
        //   _WELP_TokensFound.txt - found / not-found token report per word group
        //   _WELP_CosineSim.csv   - cosine similarity of each row against each group average
        //
        // e.Cancel is set to true when an error was reported to the user, when none of the
        // tokens were found, or when cancellation was requested on BgWorker. In each of those
        // cases the remaining stages are skipped.
        private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //unpack what was handed to the background worker
            BgWorkerInformation BgData = (BgWorkerInformation)e.Argument;

            string InputFile  = BgData.InputFile;
            bool   HasHeaders = Convert.ToBoolean(BgData.HasHeaders);

            //each character of the delimiter string becomes its own one-character delimiter
            string[] Delimiters  = BgData.Delimiters.ToCharArray().Select(c => c.ToString()).ToArray();
            bool     UsingQuotes = Convert.ToBoolean(BgData.UsingQuotes);

            //numbers are parsed and written with the invariant culture so that results do not
            //depend on the machine's regional settings; a comma decimal separator would also
            //corrupt the comma-separated output file. the type is fully qualified so that no
            //additional "using" directive is needed
            System.Globalization.CultureInfo Invariant = System.Globalization.CultureInfo.InvariantCulture;

            //the dropdown is read through Invoke so that the control is touched on its own thread
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });

            ulong Total_Number_of_Tokens = Convert.ToUInt64(BgData.Tokens_Altogether.Count());
            ulong number_of_word_lists   = Convert.ToUInt64(BgData.Tokens.Count());

            int vectorlength = BgData.EndingCol - BgData.StartingCol + 1;

            //holds running sums during the first pass and the averages afterwards;
            //newly allocated arrays are already filled with zeros
            double[][] averagevector = new double[number_of_word_lists][];
            for (ulong i = 0; i < number_of_word_lists; i++)
            {
                averagevector[i] = new double[vectorlength];
            }



            //  first pass: sum up the vectors of every row whose token is in a word list

            try
            {
                using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding))
                {
                    using (StreamWriter outputFile_subvectors = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_Subvectors.txt"),
                                                                                                FileMode.Create, FileAccess.Write), SelectedEncoding))
                    {
                        // set the parser properties
                        parser.TrimWhiteSpace = true; //trim the whitespace to make sure that files/folder names don't end with a space, which will break the program
                        parser.TextFieldType  = FieldType.Delimited;
                        parser.SetDelimiters(Delimiters);
                        parser.HasFieldsEnclosedInQuotes = UsingQuotes;

                        //this is used for header handling and reporting
                        bool  firstLine  = true;
                        ulong LineNumber = 0;
                        ulong detected_tokens_altogether = 0;

                        //number of matched rows per word list (the divisor for the average)
                        ulong[] detected_tokens_per_wordlist = new ulong[number_of_word_lists];

                        //distinct tokens that were matched, per word list
                        HashSet<string>[] Detected_Token_Hashset = new HashSet<string>[BgData.Tokens.Length];
                        for (int i = 0; i < Detected_Token_Hashset.Length; i++)
                        {
                            Detected_Token_Hashset[i] = new HashSet<string>();
                        }

                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Preparing...";
                        });

                        //loop through each row of the dataset
                        while (!parser.EndOfData && !BgWorker.CancellationPending)
                        {
                            string[] fields = parser.ReadFields();

                            LineNumber++;

                            //report what row we're working on
                            if (LineNumber % 100 == 0)
                            {
                                FilenameLabel.Invoke((MethodInvoker) delegate
                                {
                                    FilenameLabel.Text = "Getting average vector(s)... Currently reading row #" + LineNumber.ToString();
                                });
                            }

                            //if the first line of the dataset is a header row, skip it and
                            //do not count it as a data row
                            if (firstLine)
                            {
                                firstLine = false;
                                if (HasHeaders)
                                {
                                    LineNumber--;
                                    continue;
                                }
                            }

                            string token = fields[BgData.TokenCol];

                            //first, we want to know if the row even contains a token in our list
                            if (BgData.Tokens_Altogether.Contains(token))
                            {
                                detected_tokens_altogether++;

                                //then figure out which word lists contain the token and add the
                                //row's vector to the running sum of each of them
                                for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++)
                                {
                                    if (!BgData.Tokens[wordlist_counter].Contains(token))
                                    {
                                        continue;
                                    }

                                    Detected_Token_Hashset[wordlist_counter].Add(token);
                                    detected_tokens_per_wordlist[wordlist_counter]++;

                                    try
                                    {
                                        //copy just the vector columns into a new array
                                        string[] vector = new string[vectorlength];
                                        Array.Copy(fields, BgData.StartingCol, vector, 0, vectorlength);
                                        double[] vector_numeric = Array.ConvertAll(vector, s => double.Parse(s, Invariant));

                                        outputFile_subvectors.WriteLine(token + "\t" + string.Join("\t", vector));

                                        for (int i = 0; i < vectorlength; i++)
                                        {
                                            averagevector[wordlist_counter][i] += vector_numeric[i];
                                        }
                                    }
                                    catch
                                    {
                                        MessageBox.Show("There was an error reading your vectors." + "\r\n" +
                                                        "Are you sure that you selected columns that only contain numbers?",
                                                        "Vector parsing error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                                        e.Cancel = true;
                                        break;
                                    }
                                }
                            }

                            //stop on error, or once as many matching rows were seen as there are
                            //tokens in the list. NOTE(review): the second condition counts rows,
                            //so it presumably relies on each token appearing only once in the
                            //model file - confirm
                            if (e.Cancel || detected_tokens_altogether == Total_Number_of_Tokens)
                            {
                                break;
                            }
                        }

                        if (BgWorker.CancellationPending)
                        {
                            //the loop was left because the user asked to stop: flag the run as
                            //cancelled so that no results are produced from partial data
                            e.Cancel = true;
                        }
                        else if (detected_tokens_altogether == 0)
                        {
                            //let user know if there was an issue with finding tokens
                            MessageBox.Show("None of the tokens in your list were found.",
                                            "No Tokens Found", MessageBoxButtons.OK, MessageBoxIcon.Error);
                            e.Cancel = true;
                        }

                        if (!e.Cancel)
                        {
                            string Divider = "\r\n------------------------------------------------\r\n";

                            //report of the tokens that were / were not captured
                            StringBuilder tokens_found_output = new StringBuilder();

                            for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++)
                            {
                                //turn the running sums into averages. a word list without any
                                //matched row divides by zero here, which yields NaN values
                                for (int i = 0; i < vectorlength; i++)
                                {
                                    averagevector[wordlist_counter][i] /= detected_tokens_per_wordlist[wordlist_counter];
                                }

                                //figure out which tokens of this word list were not caught
                                HashSet<string> found            = Detected_Token_Hashset[wordlist_counter];
                                List<string>    UndetectedTokens = new List<string>();
                                foreach (string candidate in BgData.Tokens[wordlist_counter].ToArray())
                                {
                                    if (!found.Contains(candidate))
                                    {
                                        UndetectedTokens.Add(candidate);
                                    }
                                }

                                string GroupNumber = (wordlist_counter + 1).ToString();

                                tokens_found_output.Append(Divider +
                                                           "TOKENS FOUND, WORD GROUP #" + GroupNumber + ":" +
                                                           Divider +
                                                           string.Join("\r\n", found));

                                tokens_found_output.Append("\r\n\r\n\r\n" +
                                                           Divider +
                                                           "TOKENS NOT FOUND, WORD GROUP #" + GroupNumber + ":" +
                                                           Divider +
                                                           string.Join("\r\n", UndetectedTokens) +
                                                           "\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n");
                            }

                            try
                            {
                                using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_AvgVector.txt"),
                                                                                                 FileMode.Create, FileAccess.Write), SelectedEncoding))
                                {
                                    for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++)
                                    {
                                        outputFile.WriteLine("Word_Group_" + (wordlist_counter + 1).ToString() + "\t" +
                                                             string.Join("\t", averagevector[wordlist_counter].Select(v => v.ToString(Invariant))));
                                    }
                                }

                                using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_TokensFound.txt"),
                                                                                                 FileMode.Create, FileAccess.Write), SelectedEncoding))
                                {
                                    outputFile.Write(tokens_found_output.ToString());
                                }
                            }
                            catch
                            {
                                MessageBox.Show("There was an error writing your output.",
                                                "Write file error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                                e.Cancel = true;
                            }
                        }

                        //end "using" for retained vector output
                    }

                    //end of "using" textfieldparser
                }
            }
            catch
            {
                MessageBox.Show("An error occurred somewhere while trying to parse your model file.",
                                "General Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                e.Cancel = true;
            }



            //nothing left to do if the first pass failed or was cancelled
            if (e.Cancel)
            {
                return;
            }



            //  second pass: cosine similarity of every row against each averaged vector
            //
            //  https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
            //  Cosine Similarity (d1, d2) = Dot product(d1, d2) / (||d1|| * ||d2||)
            //
            //  Dot product (d1, d2) = d1[0] * d2[0] + d1[1] * d2[1] + ... + d1[n] * d2[n]
            //  ||d1|| = square root(d1[0]^2 + d1[1]^2 + ... + d1[n]^2)
            //  ||d2|| = square root(d2[0]^2 + d2[1]^2 + ... + d2[n]^2)

            try
            {
                using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding))
                {
                    // set the parser properties
                    parser.TrimWhiteSpace = true; //trim the whitespace to make sure that files/folder names don't end with a space, which will break the program
                    parser.TextFieldType  = FieldType.Delimited;
                    parser.SetDelimiters(Delimiters);
                    parser.HasFieldsEnclosedInQuotes = UsingQuotes;

                    //this is used for header handling and reporting
                    bool  firstLine  = true;
                    ulong LineNumber = 0;

                    using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_CosineSim.csv"),
                                                                                     FileMode.Create, FileAccess.Write), SelectedEncoding))
                    {
                        //write the header row: the token column plus one column per word group
                        StringBuilder header = new StringBuilder("\"Token\"");
                        for (ulong i = 0; i < number_of_word_lists; i++)
                        {
                            header.Append(",\"Grp_" + (i + 1).ToString() + "_CosineSim\"");
                        }
                        outputFile.WriteLine(header.ToString());

                        while (!parser.EndOfData && !BgWorker.CancellationPending)
                        {
                            string[] fields = parser.ReadFields();

                            LineNumber++;

                            //report what row we're working on
                            if (LineNumber % 100 == 0)
                            {
                                FilenameLabel.Invoke((MethodInvoker) delegate
                                {
                                    FilenameLabel.Text = "Calculating cosine similarities... Currently reading row #" + LineNumber.ToString();
                                });
                            }

                            //if the first line of the dataset is a header row, skip it and
                            //do not count it as a data row
                            if (firstLine)
                            {
                                firstLine = false;
                                if (HasHeaders)
                                {
                                    LineNumber--;
                                    continue;
                                }
                            }

                            try
                            {
                                //pull the numeric vector out of the row
                                string[] vector = new string[vectorlength];
                                Array.Copy(fields, BgData.StartingCol, vector, 0, vectorlength);
                                double[] vector_numeric = Array.ConvertAll(vector, s => double.Parse(s, Invariant));

                                //becomes true once any group's similarity exceeds the threshold
                                bool     at_least_one_cossim = false;
                                double[] CosineSims          = new double[number_of_word_lists];

                                for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++)
                                {
                                    double dotproduct = 0;
                                    double d1         = 0;
                                    double d2         = 0;

                                    //calculate cosine similarity components
                                    for (int i = 0; i < vectorlength; i++)
                                    {
                                        dotproduct += averagevector[wordlist_counter][i] * vector_numeric[i];
                                        d1         += averagevector[wordlist_counter][i] * averagevector[wordlist_counter][i];
                                        d2         += vector_numeric[i] * vector_numeric[i];
                                    }

                                    CosineSims[wordlist_counter] = dotproduct / (Math.Sqrt(d1) * Math.Sqrt(d2));

                                    if (Math.Abs(CosineSims[wordlist_counter]) > BgData.OmitBelowValue)
                                    {
                                        at_least_one_cossim = true;
                                    }
                                }

                                //a threshold of exactly zero means "write every row"; otherwise the
                                //row is only written when at least one group exceeds the threshold
                                if (BgData.OmitBelowValue == 0.0 || at_least_one_cossim)
                                {
                                    StringBuilder LineToWrite = new StringBuilder();

                                    //quote the token, doubling any embedded quotes
                                    LineToWrite.Append("\"" + fields[BgData.TokenCol].Replace("\"", "\"\"") + "\"");

                                    for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++)
                                    {
                                        //values that do not exceed the threshold are left as empty cells
                                        LineToWrite.Append(",");

                                        if (BgData.OmitBelowValue == 0.0 || Math.Abs(CosineSims[wordlist_counter]) > BgData.OmitBelowValue)
                                        {
                                            LineToWrite.Append(CosineSims[wordlist_counter].ToString(Invariant));
                                        }
                                    }

                                    outputFile.WriteLine(LineToWrite.ToString());
                                }
                            }
                            catch
                            {
                                MessageBox.Show("There was an error reading your vectors." + "\r\n" +
                                                "Are you sure that you selected columns that only contain numbers?",
                                                "Vector parsing error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                                e.Cancel = true;
                                break;
                            }

                            //end while
                        }

                        //a cancellation request ends the loop early, so report the run as cancelled
                        if (BgWorker.CancellationPending)
                        {
                            e.Cancel = true;
                        }

                        //end "using" for file output
                    }

                    //end "using" for textfieldparser
                }
            }
            catch
            {
                MessageBox.Show("An error occurred somewhere while trying to calculate similarities.",
                                "General Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                e.Cancel = true;
            }
        }
Beispiel #17
0
        // Background-worker body. Loads the segmenter model from the "data" folder next to the
        // application, runs it over every *.txt file in the input folder and writes each result
        // to a file of the same name in the output folder.
        //
        // e.Argument must be a string[]: element 0 is the folder to read from and element 1 is
        // the folder to write to (created if it does not exist).
        //
        // Any exception raised while creating the output folder or while reading, segmenting or
        // writing a file stops the run and is reported through a single message box.
        //
        // NOTE: output files are named after the input file name only, so files with the same
        // name in different subfolders are written to the same output path.
        private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //selects the text encoding based on user selection; the dropdown is read through
            //Invoke so that the control is touched on its own thread
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });


            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker) delegate
            {
                FilenameLabel.Text = "Loading model... please wait...";
            });

            // Path to the folder with models
            var segmenterData = Path.Combine(Path.GetDirectoryName(AppDomain.CurrentDomain.BaseDirectory), @"data");

            var props = new Properties();

            props.setProperty("sighanCorporaDict", segmenterData);
            props.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it.
            // WebName is the registered charset name (for example "utf-8"); Encoding.ToString()
            // would only give the .NET type name, which is not a charset name.
            props.setProperty("inputEncoding", SelectedEncoding.WebName);
            props.setProperty("sighanPostProcessing", "true");

            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", props);


            //get the list of files, optionally descending into subfolders
            string[] Folders = (string[])e.Argument;

            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(Folders[0], "*.txt", SearchDepth);



            try
            {
                string outputdir = Folders[1];

                Directory.CreateDirectory(outputdir);

                foreach (string fileName in files)
                {
                    //the bare file name is used for both the progress label and the output file
                    string Filename_Clean = Path.GetFileName(fileName);

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    });

                    //read the whole file, lowercase it and segment it
                    string readText     = File.ReadAllText(fileName, SelectedEncoding).ToLower();
                    string TokenResults = segmenter.classifyToString(readText);

                    //overwrite any existing output file of the same name
                    using (StreamWriter fileout = new StreamWriter(Path.Combine(outputdir, Filename_Clean), false, SelectedEncoding))
                    {
                        fileout.Write(TokenResults);
                    }
                }
            }
            catch
            {
                MessageBox.Show("ZhToken encountered a problem while trying to tokenize/write a file.");
            }
        }
Beispiel #18
0
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            BackgroundWorkerData BGData = (BackgroundWorkerData)e.Argument;


            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker) delegate
            {
                FilenameLabel.Text = "Loading model...";
            });


            //set up our sentence boundary detection
            Regex SentenceSplitter = new Regex(@"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", RegexOptions.Compiled);

            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });



            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(BGData.TextFileFolder, "*.txt", SearchDepth);



            try {
                var tagger = new MaxentTagger(modelsDirectory + @"/" + BGData.SelectedModel);

                int NumberOfTagsInModel = tagger.numTags();

                List <string> tags_list_header = new List <string>();
                List <string> tags_list        = new List <string>();


                for (int i = 0; i < NumberOfTagsInModel; i++)
                {
                    tags_list_header.Add("\"" + tagger.getTag(i) + "\"");
                    tags_list.Add(tagger.getTag(i));
                }

                tags_list_header.Sort();
                tags_list.Sort();

                string[] tags_array = tags_list.ToArray();



                //open up the output file
                using (StreamWriter outputFile = new StreamWriter(new FileStream(BGData.OutputFileLocation, FileMode.Create), SelectedEncoding))
                {
                    //write the header row to the output file
                    StringBuilder HeaderString = new StringBuilder();
                    HeaderString.Append("\"Filename\",\"Segment\",\"TokenCount\",\"SentenceCount\"," + string.Join(",", tags_list_header.ToArray()));

                    if (BGData.OutputTaggedText)
                    {
                        HeaderString.Append(",\"TaggedText\"");
                    }
                    if (BGData.OrderedPOSTagText)
                    {
                        HeaderString.Append(",\"OrderedPOSTags\"");
                    }

                    outputFile.WriteLine(HeaderString.ToString());


                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);


                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });



                        //read in the text file, convert everything to lowercase
                        var InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();

                        var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(InputText)).toArray();



                        //now that we know how many sentences we have, we can figure out the segmentation
                        double SentencesPerSegment = 1.0;
                        int    NumberOfSegments    = BGData.NumSegments;
                        if (NumberOfSegments > sentences.Length)
                        {
                            NumberOfSegments = sentences.Length;
                        }

                        if (sentences.Length > 0)
                        {
                            SentencesPerSegment = sentences.Length / (double)NumberOfSegments;
                        }


                        List <List <ArrayList> > Sentences_Segmented = new List <List <ArrayList> >();

                        int SegmentCounter = 1;
                        //int SentenceNumberTracker = 0;
                        for (int i = 0; i < sentences.Length; i++)
                        {
                            if (Sentences_Segmented.Count < SegmentCounter)
                            {
                                Sentences_Segmented.Add(new List <ArrayList>());
                            }

                            Sentences_Segmented[SegmentCounter - 1].Add((ArrayList)sentences[i]);
                            //SentenceNumberTracker++;

                            if (i + 1 >= SegmentCounter * SentencesPerSegment)
                            {
                                SegmentCounter++;
                                //SentenceNumberTracker = 0;
                            }
                        }


                        sentences = null;



                        //     _                _                 _____         _
                        //    / \   _ __   __ _| |_   _ _______  |_   _|____  _| |_
                        //   / _ \ | '_ \ / _` | | | | |_  / _ \   | |/ _ \ \/ / __|
                        //  / ___ \| | | | (_| | | |_| |/ /  __/   | |  __/>  <| |_
                        // /_/   \_\_| |_|\__,_|_|\__, /___\___|   |_|\___/_/\_\\__|
                        //                        |___/



                        for (int i = 0; i < NumberOfSegments; i++)
                        {
                            Dictionary <string, int> POSSums = new Dictionary <string, int>();
                            for (int j = 0; j < NumberOfTagsInModel; j++)
                            {
                                POSSums.Add(tags_array[j], 0);
                            }


                            StringBuilder TaggedText     = new StringBuilder();
                            StringBuilder OrderedPOSTags = new StringBuilder();

                            int TotalSentences = Sentences_Segmented[i].Count;
                            int TotalWC        = 0;


                            foreach (ArrayList sentence in Sentences_Segmented[i])
                            {
                                var taggedSentence = tagger.tagSentence(sentence);


                                Iterator it = taggedSentence.iterator();



                                while (it.hasNext())
                                {
                                    TaggedWord token = (TaggedWord)it.next();

                                    if (BGData.OutputTaggedText)
                                    {
                                        TaggedText.Append(token.toString() + " ");
                                    }
                                    if (BGData.OrderedPOSTagText)
                                    {
                                        OrderedPOSTags.Append(token.tag() + " ");
                                    }


                                    POSSums[token.tag()] += 1;
                                    TotalWC += 1;

                                    //MessageBox.Show(token.word());
                                }

                                TaggedText.Append(Environment.NewLine);
                                OrderedPOSTags.Append(Environment.NewLine);
                            }



                            // __        __    _ _          ___        _               _
                            // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                            //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                            //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                            //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                            //                                            |_|



                            string[] OutputString = new string[4];
                            OutputString[0] = "\"" + Filename_Clean + "\"";
                            OutputString[1] = (i + 1).ToString();
                            OutputString[2] = TotalWC.ToString();
                            OutputString[3] = TotalSentences.ToString();

                            int include_tagged_text = 0;
                            int include_ordered_pos = 0;
                            if (BGData.OutputTaggedText)
                            {
                                include_tagged_text = 1;
                            }
                            if (BGData.OrderedPOSTagText)
                            {
                                include_ordered_pos = 1;
                            }

                            string[] TagOutputString = new string[NumberOfTagsInModel + include_tagged_text + include_ordered_pos];

                            for (int j = 0; j < NumberOfTagsInModel; j++)
                            {
                                if (BGData.NormalizeOutput && TotalWC > 0)
                                {
                                    TagOutputString[j] = RoundUp(POSSums[tags_array[j]] * 100 / (double)TotalWC, 5).ToString();
                                }
                                else
                                {
                                    TagOutputString[j] = POSSums[tags_array[j]].ToString();
                                }
                            }

                            if (BGData.OutputTaggedText)
                            {
                                TagOutputString[TagOutputString.Length - include_tagged_text - include_ordered_pos] = "\"" + TaggedText.ToString().Replace("\"", "\"\"") + "\"";
                            }
                            if (BGData.OrderedPOSTagText)
                            {
                                TagOutputString[TagOutputString.Length - include_ordered_pos] = "\"" + OrderedPOSTags.ToString().Replace("\"", "\"\"") + "\"";
                            }

                            outputFile.WriteLine(String.Join(",", MergeOutputArrays(OutputString, TagOutputString)));
                        }



                        //end of the "for each file" loop
                    }
                }
            }
            catch (OutOfMemoryException OOM)
            {
                MessageBox.Show("One or more of your files caused an Out of Memory error. This means that you do not have enough RAM to process the current file. This is often caused by extremely complex / messy language samples with run-on sentences or other peculiar constructions, paired with a computer that does not have enough RAM to handle such processing.", "Out of Memory", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
            catch
            {
                MessageBox.Show("POSTModern encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while POSTModern is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Beispiel #19
0
        /// <summary>
        /// Background-worker handler that sends every matching file under
        /// <c>BGWorkerData.TextFileFolder</c> through the translation client, chunk by chunk,
        /// and writes the translated text to the mirrored path under
        /// <c>BGWorkerData.OutputFileLocation</c>.
        /// </summary>
        /// <param name="sender">Expected to be the <see cref="BackgroundWorker"/> raising the event; it is polled for pending cancellation requests.</param>
        /// <param name="e"><c>Argument</c> must hold the <c>DictionaryData</c> settings; <c>Cancel</c> is set to true when the run stops early (cancellation request or daily quota exhausted).</param>
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            DictionaryData   BGWorkerData = (DictionaryData)e.Argument;
            BackgroundWorker worker       = sender as BackgroundWorker;

            TranslationClient client = TranslationClient.Create();

            //everything that comes from a control is read in one marshalled call on the UI thread
            Encoding     InputSelectedEncoding  = null;
            Encoding     OutputSelectedEncoding = null;
            SearchOption SearchDepth            = SearchOption.TopDirectoryOnly;

            this.Invoke((MethodInvoker) delegate()
            {
                InputSelectedEncoding  = Encoding.GetEncoding(InputEncodingDropdown.SelectedItem.ToString());
                OutputSelectedEncoding = Encoding.GetEncoding(OutputEncodingDropdown.SelectedItem.ToString());

                if (ScanSubfolderCheckbox.Checked)
                {
                    SearchDepth = SearchOption.AllDirectories;
                }
            });

            //get the list of files
            var files = Directory.EnumerateFiles(BGWorkerData.TextFileFolder, BGWorkerData.FileExtension, SearchDepth);

            try
            {
                foreach (string fileName in files)
                {
                    if (TranslationCancelRequested(worker, e))
                    {
                        break;
                    }

                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    string SubDirStructure = Path.GetDirectoryName(fileName).Replace(BGWorkerData.TextFileFolder, "").TrimStart('\\');

                    //creates subdirs if they don't exist
                    string Output_Location = BGWorkerData.OutputFileLocation + '\\' + SubDirStructure;

                    if (!Directory.Exists(Output_Location))
                    {
                        Directory.CreateDirectory(Output_Location);
                    }

                    Output_Location = Path.Combine(Output_Location, Filename_Clean);

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Processing: " + Filename_Clean;
                        FilenameLabel.Refresh();
                    });

                    //the reader intentionally stays open until the output has been written,
                    //exactly as before
                    using (StreamReader inputfile = new StreamReader(fileName, InputSelectedEncoding))
                    {
                        string readText = inputfile.ReadToEnd();

                        string[] readText_Chunked = new string[0];

                        if (!string.IsNullOrWhiteSpace(readText))
                        {
                            readText_Chunked = SplitStringByLength(readText, BGWorkerData.MaxCharsPerRequest);
                        }

                        StringBuilder TranslatedText_Output = new StringBuilder();

                        for (int i = 0; i < readText_Chunked.Length; i++)
                        {
                            if (TranslationCancelRequested(worker, e))
                            {
                                break;
                            }

                            string RequestLabel = "Status: Sending request " + (i + 1).ToString() + "/" + readText_Chunked.Length.ToString() + " to API...";

                            if (!TranslateChunkWithRetries(client, BGWorkerData, readText_Chunked[i], RequestLabel, TranslatedText_Output, worker, e))
                            {
                                break;
                            }
                        }

                        //open up the output file; whatever was translated so far is still written
                        //when the chunk loop stopped early
                        using (StreamWriter outputFile = new StreamWriter(new FileStream(Output_Location, FileMode.Create), OutputSelectedEncoding))
                        {
                            outputFile.Write(TranslatedText_Output.ToString());
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show("Transmogrifier encountered an issue somewhere while trying to translate your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application? " +
                                "After clicking the \"OK\" Button, you will receive an error code. Please write down this error code (or take a screenshot) and contact the software's author ([email protected]) for additional help.", "Error while translating", MessageBoxButtons.OK, MessageBoxIcon.Error);

                MessageBox.Show(ex.ToString(), "Error Code", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }

        /// <summary>
        /// Returns true when the run should stop. A pending <c>CancelAsync</c> request on the
        /// worker is folded into <c>e.Cancel</c>, because <c>e.Cancel</c> on its own is only
        /// ever true after this handler has set it.
        /// </summary>
        private static bool TranslationCancelRequested(BackgroundWorker worker, DoWorkEventArgs e)
        {
            if (!e.Cancel && worker != null && worker.CancellationPending)
            {
                e.Cancel = true;
            }

            return e.Cancel;
        }

        /// <summary>
        /// Shows <paramref name="text"/> in the status label on the UI thread and repaints it immediately.
        /// </summary>
        private void SetTranslationStatus(string text)
        {
            StatusLabel.Invoke((MethodInvoker) delegate
            {
                StatusLabel.Text = text;
                StatusLabel.Refresh();
            });
        }

        /// <summary>
        /// Sleeps for roughly <paramref name="totalSeconds"/> seconds in one-second steps, showing
        /// the remaining time after <paramref name="statusPrefix"/>. Returns false if the run was
        /// cancelled while waiting.
        /// </summary>
        private bool WaitWithCountdown(double totalSeconds, string statusPrefix, BackgroundWorker worker, DoWorkEventArgs e)
        {
            for (int elapsed = 0; elapsed < totalSeconds; elapsed++)
            {
                if (TranslationCancelRequested(worker, e))
                {
                    return false;
                }

                SetTranslationStatus(statusPrefix + System.Math.Ceiling(totalSeconds - elapsed).ToString() + "...");

                //sleep instead of spinning on DateTime.Now so the wait does not burn a CPU core
                System.Threading.Thread.Sleep(1000);
            }

            return !TranslationCancelRequested(worker, e);
        }

        /// <summary>
        /// Translates one chunk and appends the result (plus a trailing space) to
        /// <paramref name="output"/>. HTTP 403 answers other than "Daily Limit Exceeded" are retried
        /// after <c>DurationLength + 2</c> seconds; 429 and 5xx answers are retried after
        /// <c>attempt squared</c> seconds; both up to <c>MaxRetries</c> times. Other API errors are
        /// shown in the status label and the chunk is skipped.
        /// </summary>
        /// <returns>
        /// False when the whole run must stop (daily quota exhausted or cancellation requested);
        /// true otherwise, including when the chunk had to be skipped.
        /// </returns>
        private bool TranslateChunkWithRetries(TranslationClient client, DictionaryData BGWorkerData, string chunk, string requestLabel, StringBuilder output, BackgroundWorker worker, DoWorkEventArgs e)
        {
            Google.GoogleApiException failure;

            try
            {
                SetTranslationStatus(requestLabel);

                var response = client.TranslateText(chunk,
                                                    sourceLanguage: BGWorkerData.InputLang,
                                                    targetLanguage: BGWorkerData.OutputLang);

                output.Append(response.TranslatedText + " ");
                return true;
            }
            catch (Google.GoogleApiException ex)
            {
                //without a parsed error payload there is nothing to classify; let the caller's
                //handler report the real API exception instead of a NullReferenceException
                if (ex.Error == null)
                {
                    throw;
                }

                failure = ex;
            }

            if (TranslationCancelRequested(worker, e))
            {
                return false;
            }

            int    ErrorCode    = failure.Error.Code;
            string ErrorMessage = failure.Error.Message ?? "";

            if (ErrorCode == 403 && ErrorMessage.Contains("Daily Limit Exceeded"))
            {
                SetTranslationStatus("Status: " + ErrorMessage);

                MessageBox.Show("The Google Translate API reports that you have exceeded your daily use limit. You will need to visit the \"Quotas\" section of the Google Cloud Dashboard to increase your limits or, alternatively, wait until midnight for your quota to reset.", "Daily Limit Exceeded", MessageBoxButtons.OK, MessageBoxIcon.Stop);
                e.Cancel = true;
                return false;
            }

            bool RateLimited = (ErrorCode == 403);
            bool Transient   = (ErrorCode == 429) || (ErrorCode >= 500 && ErrorCode < 600);

            if (!RateLimited && !Transient)
            {
                //not retryable: report it and move on to the next chunk
                SetTranslationStatus("Status: " + ErrorMessage);
                return true;
            }

            string LastError = ErrorMessage;

            for (int attempt = 1; attempt <= BGWorkerData.MaxRetries; attempt++)
            {
                double WaitSeconds;
                string WaitPrefix;

                if (RateLimited)
                {
                    WaitSeconds = BGWorkerData.DurationLength + 2;
                    WaitPrefix  = "Status: Rate limit reached. Sleeping for ";
                }
                else
                {
                    WaitSeconds = System.Math.Pow(attempt, 2);
                    WaitPrefix  = "Status: Error " + ErrorCode.ToString() + "; " + ErrorMessage + " -- Retrying in ";
                }

                if (!WaitWithCountdown(WaitSeconds, WaitPrefix, worker, e))
                {
                    return false;
                }

                try
                {
                    SetTranslationStatus(requestLabel + " Retry #" + attempt.ToString());

                    var response = client.TranslateText(chunk,
                                                        sourceLanguage: BGWorkerData.InputLang,
                                                        targetLanguage: BGWorkerData.OutputLang);

                    output.Append(response.TranslatedText + " ");
                    return true;
                }
                catch (Exception retryFailure)
                {
                    //retries stay best-effort, but remember why the attempt failed so it can be
                    //reported if every retry is used up
                    LastError = retryFailure.Message;
                }
            }

            SetTranslationStatus("Status: Request failed after " + BGWorkerData.MaxRetries.ToString() + " retries -- " + LastError);
            return true;
        }
Beispiel #20
0
        /// <summary>
        /// Background-worker handler that scans every <c>*.txt</c> file in a folder and collects
        /// the distinct speaker tags: for each trimmed line, the text from the start of the line
        /// through the first occurrence of the delimiter, kept only if it is no longer than the
        /// maximum tag length.
        /// </summary>
        /// <param name="sender">Not used.</param>
        /// <param name="e">
        /// <c>Argument</c> must be a <c>string[]</c> holding, in order, the folder to scan, the
        /// maximum tag length (parsed as an integer) and the delimiter string. <c>Result</c>
        /// receives the <c>HashSet&lt;string&gt;</c> of tags found, including whatever was
        /// collected before a scanning error.
        /// </param>
        private void DetectSpeakersBGWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //pull out the arguments and put them into more accessible variable names
            string[] Arguments = (string[])e.Argument;

            //lines are split on any run of CR / LF characters
            Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

            //everything that comes from a control is read in one marshalled call on the UI thread
            Encoding SelectedEncoding = null;
            string   RegExString      = "";
            var      SearchDepth      = SearchOption.TopDirectoryOnly;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
                RegExString      = RegexTextBox.Text;

                if (ScanSubfolderCheckbox.Checked)
                {
                    SearchDepth = SearchOption.AllDirectories;
                }
            });

            //the user-supplied pattern is only compiled when there is one; whatever it matches
            //is stripped from each line before the delimiter search
            bool  UsingRegex    = !string.IsNullOrEmpty(RegExString);
            Regex CompiledRegex = UsingRegex ? new Regex(RegExString, RegexOptions.Compiled) : null;

            //get the list of files
            var files = Directory.EnumerateFiles(Arguments[0], "*.txt", SearchDepth);

            int    MaxTagLength    = int.Parse(Arguments[1]);
            string DelimiterString = Arguments[2];
            int    DelimiterLength = DelimiterString.Length;

            HashSet <string> SpeakerList = new HashSet <string>();

            try
            {
                foreach (string fileName in files)
                {
                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    });

                    //loop through all of the lines in each text
                    foreach (string RawLine in NewlineClean.Split(File.ReadAllText(fileName, SelectedEncoding)))
                    {
                        string CurrentLine = UsingRegex
                                             ? CompiledRegex.Replace(RawLine, "").Trim()
                                             : RawLine.Trim();

                        //ordinal search: the delimiter is a literal marker, and an ordinal match
                        //always spans exactly DelimiterLength characters, so the Substring below
                        //cannot run past the end of the line
                        int IndexOfDelimiter = CurrentLine.IndexOf(DelimiterString, System.StringComparison.Ordinal);

                        if (IndexOfDelimiter < 0)
                        {
                            continue;
                        }

                        string SpeakerTag = CurrentLine.Substring(0, IndexOfDelimiter + DelimiterLength);

                        //HashSet.Add already ignores duplicates, so no Contains check is needed
                        if (SpeakerTag.Length <= MaxTagLength)
                        {
                            SpeakerList.Add(SpeakerTag);
                        }
                    }
                }
            }
            catch
            {
                MessageBox.Show("ConverSplitterPlus encountered an issue while opening / scanning your files.\r\n?Are your text files open in another program?");
            }

            e.Result = SpeakerList;
        }
Beispiel #21
0
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            BGWorkerData BGData = (BGWorkerData)e.Argument;

            BGData.NumberOfMatches   = new uint[BGData.RegexArray.Length];
            BGData.TotalFilesMatched = new uint[BGData.RegexArray.Length];

            for (int i = 0; i < BGData.NumberOfMatches.Length; i++)
            {
                BGData.NumberOfMatches[i]   = 0;
                BGData.TotalFilesMatched[i] = 0;
            }


            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });



            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(BGData.TextFileFolder, BGData.Filetype, SearchDepth);



            try {
                //we want to be conservative and limit the number of threads to the number of processors that we have
                var options = new ParallelOptions {
                    MaxDegreeOfParallelism = Environment.ProcessorCount
                };
                Parallel.ForEach(files, options, (string fileName) =>
                {
                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    string SubDirStructure = Path.GetDirectoryName(fileName).Replace(BGData.TextFileFolder, "").TrimStart('\\');


                    //creates subdirs if they don't exist



                    string Output_Location = BGData.OutputFileLocation + '\\' + SubDirStructure;

                    if (!Directory.Exists(Output_Location))
                    {
                        Directory.CreateDirectory(Output_Location);
                    }

                    Output_Location = Path.Combine(Output_Location, Path.GetFileName(fileName));

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Processing: " + Filename_Clean;
                        FilenameLabel.Invalidate();
                        FilenameLabel.Update();
                        FilenameLabel.Refresh();
                        Application.DoEvents();
                    });



                    //read in the text file, convert everything to lowercase
                    string readText = File.ReadAllText(fileName, SelectedEncoding);

                    if (BGData.CompactWhitespace)
                    {
                        readText = Regex.Replace(readText, @"\s+", " ");
                    }


                    //     _                _                 _____         _
                    //    / \   _ __   __ _| |_   _ _______  |_   _|____  _| |_
                    //   / _ \ | '_ \ / _` | | | | |_  / _ \   | |/ _ \ \/ / __|
                    //  / ___ \| | | | (_| | | |_| |/ /  __/   | |  __/>  <| |_
                    // /_/   \_\_| |_|\__,_|_|\__, /___\___|   |_|\___/_/\_\\__|
                    //                        |___/

                    for (int i = 0; i < BGData.RegexArray.Length; i++)
                    {
                        int NumMatches = BGData.RegexArray[i].Matches(readText).Count;

                        if (NumMatches == 0)
                        {
                            continue;
                        }

                        BGData.NumberOfMatches[i]   += (uint)NumMatches;
                        BGData.TotalFilesMatched[i] += 1;

                        readText = BGData.RegexArray[i].Replace(readText, BGData.ReplacementArray[i]);
                    }



                    // __        __    _ _          ___        _               _
                    // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                    //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                    //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                    //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                    //                                            |_|

                    //open up the output file
                    using (StreamWriter outputFile = new StreamWriter(new FileStream(Output_Location, FileMode.Create), SelectedEncoding))
                    {
                        outputFile.Write(readText);
                    }
                });


                using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BGData.OutputFileLocation, "__TextEmend-Report.csv"), FileMode.Create), SelectedEncoding))
                {
                    outputFile.WriteLine("\"RegEx\",\"Replacement\",\"NumberOfMatches\",\"FilesWithPattern\"");

                    for (int i = 0; i < BGData.RegexArray.Length; i++)
                    {
                        outputFile.WriteLine("\"" + BGData.RegexArray[i].ToString() + "\"," +
                                             "\"" + BGData.ReplacementArray[i] + "\"," +
                                             BGData.NumberOfMatches[i].ToString() + "," +
                                             BGData.TotalFilesMatched[i].ToString());
                    }
                }
            }
            catch
            {
                MessageBox.Show("TextEmend encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application? Are you sure that your regular expressions are properly formed?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }