private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e) {
    Vocabulate.DictionaryData DictData = (Vocabulate.DictionaryData)e.Argument;
    TwitterAwareTokenizer Tokenizer = new TwitterAwareTokenizer();
    Tokenizer.Initialize_Regex();
    Vocabulate.StopWordRemover StopList = new Vocabulate.StopWordRemover();
    StopList.BuildStopList(DictData.StopListRawText);

    //sets up how many columns we're using for output
    short OutputColumnsModifier = 2;
    if (DictData.RawWordCounts) OutputColumnsModifier = 4;
    short OutputCapturedText = 0;
    if (DictData.OutputCapturedText) OutputCapturedText = 1;

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate () {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked) SearchDepth = SearchOption.AllDirectories;
    var files = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);

    string CSVQuote = DictData.CSVQuote.ToString();
    string CSVDelimiter = DictData.CSVDelimiter.ToString();

    try {
        //open up the output file
        using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding)) {
            short NumberOfHeaderLeadingColumns = 9;

            //write the header row to the output file
            StringBuilder HeaderString = new StringBuilder();
            HeaderString.Append(CSVQuote + "Filename" + CSVQuote + CSVDelimiter +
                                CSVQuote + "WC" + CSVQuote + CSVDelimiter +
                                CSVQuote + "TC_Raw" + CSVQuote + CSVDelimiter +
                                CSVQuote + "TTR_Raw" + CSVQuote + CSVDelimiter +
                                CSVQuote + "TC_Clean" + CSVQuote + CSVDelimiter +
                                CSVQuote + "TTR_Clean" + CSVQuote + CSVDelimiter +
                                CSVQuote + "TC_NonDict" + CSVQuote + CSVDelimiter +
                                CSVQuote + "TTR_NonDict" + CSVQuote + CSVDelimiter +
                                CSVQuote + "DictPercent" + CSVQuote);

            //output headers for the Concept-constrained Concept-Word Ratio (CWR)
            for (int i = 0; i < DictData.NumCats; i++) {
                HeaderString.Append(CSVDelimiter + CSVQuote + DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_CWR" + CSVQuote);
            }
            //output headers for the Concept-Category Ratio (CCR)
            for (int i = 0; i < DictData.NumCats; i++) {
                HeaderString.Append(CSVDelimiter + CSVQuote + DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_CCR" + CSVQuote);
            }
            //if they want the raw category counts, then we add those to the header as well
            if (DictData.RawWordCounts) {
                for (int i = 0; i < DictData.NumCats; i++) {
                    HeaderString.Append(CSVDelimiter + CSVQuote + DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_Count" + CSVQuote);
                }
                for (int i = 0; i < DictData.NumCats; i++) {
                    HeaderString.Append(CSVDelimiter + CSVQuote + DictData.CatNames[i].Replace(CSVQuote, CSVQuote + CSVQuote) + "_Unique" + CSVQuote);
                }
            }
            if (DictData.OutputCapturedText) {
                HeaderString.Append(CSVDelimiter + CSVQuote + "CapturedText" + CSVQuote);
            }
            outputFile.WriteLine(HeaderString.ToString());

            foreach (string fileName in files) {
                //set up our variables to report
                string Filename_Clean = Path.GetFileName(fileName);
                Dictionary<string, ulong> DictionaryResults = new Dictionary<string, ulong>();
                foreach (string Concept in DictData.ConceptMap.Keys) DictionaryResults.Add(Concept, 0);
                //the structure of DictionaryResults looks like this:
                //    Concept -> Total
                //this makes it far easier to go through and calculate the number of unique concepts
                //divided by the total number of words at the top-level categories down the road
                //for (int i = 0; i < DictData.NumCats; i++) DictionaryResults.Add(DictData.CatValues[i], 0);

                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate {
                    FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                });

                //read in the text file, convert everything to lowercase
                string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();
                int NumberOfMatches = 0;
                int WordCount_WhitespaceTokenizer = Tokenizer.TokenizeWhitespace(readText.Trim()).Length;

                //splits everything out into words
                string[] Words = Tokenizer.tokenize(readText.Trim());
                Words = StopList.ClearStopWords(Words);
                int TotalStringLength_BeforeStopList = Words.Length;
                double TTR_Raw = (Words.Distinct().Count() / (double)TotalStringLength_BeforeStopList) * 100;
                Words = Words.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
                int TotalStringLength_AfterStopList = Words.Length;
                double TTR_Clean = (Words.Distinct().Count() / (double)TotalStringLength_AfterStopList) * 100;

                StringBuilder CapturedText = new StringBuilder();
                List<string> NonmatchedTokens = new List<string>();

                // ----- Analyze Text -----

                //iterate over all words in the text file
                for (int i = 0; i < TotalStringLength_AfterStopList; i++) {
                    bool TokenMatched = false;
                    //iterate over n-grams, starting with the largest possible n-gram (derived from the user's dictionary file)
                    for (int NumberOfWords = DictData.MaxWords; NumberOfWords > 0; NumberOfWords--) {
                        //make sure that we don't overextend past the array
                        if (i + NumberOfWords - 1 >= TotalStringLength_AfterStopList) continue;

                        //make the target string
                        string TargetString;
                        if (NumberOfWords > 1) TargetString = String.Join(" ", Words.Skip(i).Take(NumberOfWords).ToArray());
                        else TargetString = Words[i];

                        //look for an exact match
                        if (DictData.FullDictionaryMap["Standards"].ContainsKey(NumberOfWords)) {
                            if (DictData.FullDictionaryMap["Standards"][NumberOfWords].ContainsKey(TargetString)) {
                                //add in the number of words found
                                NumberOfMatches += NumberOfWords;
                                //increment results
                                DictionaryResults[DictData.FullDictionaryMap["Standards"][NumberOfWords][TargetString]] += 1;
                                //manually increment the for loop so that we're not testing on words that have already been picked up
                                i += NumberOfWords - 1;
                                //break out of the n-gram loop and move on to new words altogether
                                TokenMatched = true;
                                if (DictData.OutputCapturedText) CapturedText.Append(TargetString.Replace(CSVQuote, CSVQuote + CSVQuote) + " ");
                                break;
                            }
                        }
                        //if there isn't an exact match, we have to go through the wildcards
                        if (DictData.WildCardArrays.ContainsKey(NumberOfWords)) {
                            for (int j = 0; j < DictData.WildCardArrays[NumberOfWords].Length; j++) {
                                if (DictData.PrecompiledWildcards[DictData.WildCardArrays[NumberOfWords][j]].Matches(TargetString).Count > 0) {
                                    //add in the number of words found
                                    NumberOfMatches += NumberOfWords;
                                    //increment results
                                    DictionaryResults[DictData.FullDictionaryMap["Wildcards"][NumberOfWords][DictData.WildCardArrays[NumberOfWords][j]]] += 1;
                                    //manually increment the for loop so that we're not testing on words that have already been picked up
                                    i += NumberOfWords - 1;
                                    TokenMatched = true;
                                    if (DictData.OutputCapturedText) CapturedText.Append(TargetString.Replace(CSVQuote, CSVQuote + CSVQuote) + " ");
                                    break;
                                }
                            }
                        }
                        //if a wildcard matched, break out of the n-gram loop as well;
                        //the break above only exits the wildcard loop, so without this check
                        //we would keep testing smaller n-grams on already-consumed words
                        if (TokenMatched) break;
                    }
                    //this is what we do if we didn't find any match in our dictionary
                    if (!TokenMatched) NonmatchedTokens.Add(Words[i]);
                }

                // ----- Write Output -----

                string[] OutputString = new string[NumberOfHeaderLeadingColumns + (DictData.NumCats * OutputColumnsModifier) + OutputCapturedText];
                for (int i = 0; i < OutputString.Length; i++) OutputString[i] = "";

                OutputString[0] = CSVQuote + Filename_Clean + CSVQuote;                           //filename
                OutputString[1] = WordCount_WhitespaceTokenizer.ToString();                       //word count
                OutputString[2] = TotalStringLength_BeforeStopList.ToString();                    //total number of words
                if (TotalStringLength_BeforeStopList > 0) OutputString[3] = TTR_Raw.ToString();   //TTR_Raw
                OutputString[4] = TotalStringLength_AfterStopList.ToString();                     //total number of tokens after stoplist processing
                if (TotalStringLength_AfterStopList > 0) OutputString[5] = TTR_Clean.ToString();  //TTR_Clean
                OutputString[6] = (TotalStringLength_AfterStopList - NumberOfMatches).ToString(); //number of non-dictionary tokens
                if (NonmatchedTokens.Count() > 0) {
                    OutputString[7] = (((double)NonmatchedTokens.Distinct().Count() / NonmatchedTokens.Count()) * 100).ToString(); //TTR for non-dictionary words
                }

                //calculate and output the results
                if (TotalStringLength_BeforeStopList > 0) {
                    OutputString[8] = (((double)NumberOfMatches / TotalStringLength_BeforeStopList) * 100).ToString(); //DictPercent

                    //pull together the results here
                    Dictionary<string, ulong[]> CompiledResults = new Dictionary<string, ulong[]>();
                    foreach (string TopLevelCategory in DictData.CatNames) CompiledResults.Add(TopLevelCategory, new ulong[2] { 0, 0 });
                    foreach (string ConceptKey in DictData.ConceptMap.Keys) {
                        if (DictionaryResults[ConceptKey] > 0) {
                            for (int i = 0; i < DictData.ConceptMap[ConceptKey].Length; i++) {
                                //if the Concept was found in the text, increment the first index (i.e., the number of unique concepts) by 1
                                CompiledResults[DictData.ConceptMap[ConceptKey][i]][0] += 1;
                                //if the Concept was found in the text, add the number of times that it occurred
                                CompiledResults[DictData.ConceptMap[ConceptKey][i]][1] += DictionaryResults[ConceptKey];
                            }
                        }
                    }

                    //this is where we actually calculate and output the CWR scores
                    for (int i = 0; i < DictData.CategoryOrder.Count; i++) {
                        if (WordCount_WhitespaceTokenizer > 0) {
                            OutputString[i + NumberOfHeaderLeadingColumns] = (((double)CompiledResults[DictData.CategoryOrder[i]][0] / WordCount_WhitespaceTokenizer) * 100.0).ToString();
                        }
                    }
                    //this is where we actually calculate and output the CCR scores
                    for (int i = 0; i < DictData.CategoryOrder.Count; i++) {
                        if (CompiledResults[DictData.CategoryOrder[i]][0] > 0) {
                            OutputString[i + NumberOfHeaderLeadingColumns + DictData.NumCats] = (((double)CompiledResults[DictData.CategoryOrder[i]][0] / CompiledResults[DictData.CategoryOrder[i]][1]) * 100.0).ToString();
                        }
                    }
                    //this is if the user asked for the raw counts per category
                    if (DictData.RawWordCounts) {
                        for (int i = 0; i < DictData.CategoryOrder.Count; i++) {
                            OutputString[i + NumberOfHeaderLeadingColumns + (DictData.NumCats * 2)] = CompiledResults[DictData.CategoryOrder[i]][1].ToString();
                            OutputString[i + NumberOfHeaderLeadingColumns + (DictData.NumCats * 3)] = CompiledResults[DictData.CategoryOrder[i]][0].ToString();
                        }
                    }
                } else {
                    OutputString[3] = "";
                    for (int i = 0; i < DictData.NumCats; i++) OutputString[i + NumberOfHeaderLeadingColumns] = "";
                }

                //if we're outputting the captured strings, we do that here
                if (DictData.OutputCapturedText) OutputString[OutputString.Length - 1] = CSVQuote + CapturedText.ToString() + CSVQuote;

                outputFile.WriteLine(String.Join(CSVDelimiter, OutputString));
            }
        }
    } catch {
        MessageBox.Show("Vocabulate encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while Vocabulate is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
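// A minimal, self-contained sketch of the longest-match-first n-gram strategy used in the
// "Analyze Text" loop above: try the longest possible phrase at each position, consume the
// matched tokens, then restart at the next unconsumed token. The dictionary shape here is a
// hypothetical stand-in for DictData.FullDictionaryMap, purely for illustration.
static int CountLongestMatches(string[] words, Dictionary<int, HashSet<string>> dict, int maxWords) {
    int matches = 0;
    for (int i = 0; i < words.Length; i++) {
        for (int n = Math.Min(maxWords, words.Length - i); n > 0; n--) {
            string target = string.Join(" ", words.Skip(i).Take(n));
            if (dict.TryGetValue(n, out var phrases) && phrases.Contains(target)) {
                matches += n;  //credit all n tokens of the phrase
                i += n - 1;    //consume the matched tokens
                break;         //restart at the next unconsumed token
            }
        }
    }
    return matches;
}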
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e) {
    BackgroundWorkerData BGData = (BackgroundWorkerData)e.Argument;

    //report what we're working on
    FilenameLabel.Invoke((MethodInvoker)delegate {
        FilenameLabel.Text = "Loading model...";
    });

    //set up our sentence boundary detection
    Regex SentenceSplitter = new Regex(@"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", RegexOptions.Compiled);

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate () {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked) SearchDepth = SearchOption.AllDirectories;
    var files = Directory.EnumerateFiles(BGData.TextFileFolder, "*.txt", SearchDepth);

    try {
        var tagger = new MaxentTagger(modelsDirectory + @"/" + BGData.SelectedModel);
        int NumberOfTagsInModel = tagger.numTags();
        List<string> tags_list_header = new List<string>();
        List<string> tags_list = new List<string>();
        for (int i = 0; i < NumberOfTagsInModel; i++) {
            tags_list_header.Add("\"" + tagger.getTag(i) + "\"");
            tags_list.Add(tagger.getTag(i));
        }
        tags_list_header.Sort();
        tags_list.Sort();
        string[] tags_array = tags_list.ToArray();

        //open up the output file
        using (StreamWriter outputFile = new StreamWriter(new FileStream(BGData.OutputFileLocation, FileMode.Create), SelectedEncoding)) {
            //write the header row to the output file
            StringBuilder HeaderString = new StringBuilder();
            HeaderString.Append("\"Filename\",\"Segment\",\"TokenCount\",\"SentenceCount\"," + string.Join(",", tags_list_header.ToArray()));
            if (BGData.OutputTaggedText) HeaderString.Append(",\"TaggedText\"");
            if (BGData.OrderedPOSTagText) HeaderString.Append(",\"OrderedPOSTags\"");
            outputFile.WriteLine(HeaderString.ToString());

            foreach (string fileName in files) {
                //set up our variables to report
                string Filename_Clean = Path.GetFileName(fileName);

                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate {
                    FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                });

                //read in the text file
                var InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();
                var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(InputText)).toArray();

                //now that we know how many sentences we have, we can figure out the segmentation
                double SentencesPerSegment = 1.0;
                int NumberOfSegments = BGData.NumSegments;
                if (NumberOfSegments > sentences.Length) NumberOfSegments = sentences.Length;
                if (sentences.Length > 0) SentencesPerSegment = sentences.Length / (double)NumberOfSegments;

                List<List<ArrayList>> Sentences_Segmented = new List<List<ArrayList>>();
                int SegmentCounter = 1;
                //int SentenceNumberTracker = 0;
                for (int i = 0; i < sentences.Length; i++) {
                    if (Sentences_Segmented.Count < SegmentCounter) Sentences_Segmented.Add(new List<ArrayList>());
                    Sentences_Segmented[SegmentCounter - 1].Add((ArrayList)sentences[i]);
                    //SentenceNumberTracker++;
                    if (i + 1 >= SegmentCounter * SentencesPerSegment) {
                        SegmentCounter++;
                        //SentenceNumberTracker = 0;
                    }
                }
                sentences = null;

                // ----- Analyze Text -----

                for (int i = 0; i < NumberOfSegments; i++) {
                    Dictionary<string, int> POSSums = new Dictionary<string, int>();
                    for (int j = 0; j < NumberOfTagsInModel; j++) POSSums.Add(tags_array[j], 0);

                    StringBuilder TaggedText = new StringBuilder();
                    StringBuilder OrderedPOSTags = new StringBuilder();
                    int TotalSentences = Sentences_Segmented[i].Count;
                    int TotalWC = 0;

                    foreach (ArrayList sentence in Sentences_Segmented[i]) {
                        var taggedSentence = tagger.tagSentence(sentence);
                        Iterator it = taggedSentence.iterator();
                        while (it.hasNext()) {
                            TaggedWord token = (TaggedWord)it.next();
                            if (BGData.OutputTaggedText) TaggedText.Append(token.toString() + " ");
                            if (BGData.OrderedPOSTagText) OrderedPOSTags.Append(token.tag() + " ");
                            POSSums[token.tag()] += 1;
                            TotalWC += 1;
                            //MessageBox.Show(token.word());
                        }
                        TaggedText.Append(Environment.NewLine);
                        OrderedPOSTags.Append(Environment.NewLine);
                    }

                    // ----- Write Output -----

                    string[] OutputString = new string[4];
                    OutputString[0] = "\"" + Filename_Clean + "\"";
                    OutputString[1] = (i + 1).ToString();
                    OutputString[2] = TotalWC.ToString();
                    OutputString[3] = TotalSentences.ToString();

                    int include_tagged_text = 0;
                    int include_ordered_pos = 0;
                    if (BGData.OutputTaggedText) include_tagged_text = 1;
                    if (BGData.OrderedPOSTagText) include_ordered_pos = 1;

                    string[] TagOutputString = new string[NumberOfTagsInModel + include_tagged_text + include_ordered_pos];
                    for (int j = 0; j < NumberOfTagsInModel; j++) {
                        if (BGData.NormalizeOutput && TotalWC > 0) {
                            TagOutputString[j] = RoundUp(POSSums[tags_array[j]] * 100 / (double)TotalWC, 5).ToString();
                        } else {
                            TagOutputString[j] = POSSums[tags_array[j]].ToString();
                        }
                    }
                    if (BGData.OutputTaggedText) {
                        TagOutputString[TagOutputString.Length - include_tagged_text - include_ordered_pos] = "\"" + TaggedText.ToString().Replace("\"", "\"\"") + "\"";
                    }
                    if (BGData.OrderedPOSTagText) {
                        TagOutputString[TagOutputString.Length - include_ordered_pos] = "\"" + OrderedPOSTags.ToString().Replace("\"", "\"\"") + "\"";
                    }

                    outputFile.WriteLine(String.Join(",", MergeOutputArrays(OutputString, TagOutputString)));
                } //end of the "for each segment" loop
            } //end of the "for each file" loop
        }
    } catch (OutOfMemoryException) {
        MessageBox.Show("One or more of your files caused an Out of Memory error. This means that you do not have enough RAM to process the current file. This is often caused by extremely complex / messy language samples with run-on sentences or other peculiar constructions, paired with a computer that does not have enough RAM to handle such processing.", "Out of Memory", MessageBoxButtons.OK, MessageBoxIcon.Error);
    } catch {
        MessageBox.Show("POSTModern encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while POSTModern is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
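// The segmentation block above spreads sentences as evenly as possible across the requested
// number of segments using a floating-point stride. A standalone sketch of the same
// assignment rule (illustrative only; the names here are not from POSTModern itself):
static List<List<string>> SegmentEvenly(string[] sentences, int numSegments) {
    var segments = new List<List<string>>();
    if (sentences.Length == 0) return segments;
    if (numSegments > sentences.Length) numSegments = sentences.Length;
    double perSegment = sentences.Length / (double)numSegments;
    int segment = 1;
    for (int i = 0; i < sentences.Length; i++) {
        if (segments.Count < segment) segments.Add(new List<string>());
        segments[segment - 1].Add(sentences[i]);
        //advance once this segment has received its (fractional) share of sentences
        if (i + 1 >= segment * perSegment) segment++;
    }
    return segments;
}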
// ----- Generate Preview -----
private void LoadCSVPreview_BGWorker_DoWork(object sender, DoWorkEventArgs e) {
    //here, we're basically unpacking and redefining all of the core information that was
    //passed to the background worker. it's a bit redundant and not super efficient, but the
    //loss of efficiency is more than made up for by the gains in readability
    BgWorkerInformation BgData = (BgWorkerInformation)e.Argument;
    Encoding SelectedEncoding = null;
    string InputFile = BgData.InputFile;
    bool HasHeaders = Convert.ToBoolean(BgData.HasHeaders);
    string[] Delimiters = BgData.Delimiters.ToCharArray().Select(c => c.ToString()).ToArray();
    bool UsingQuotes = Convert.ToBoolean(BgData.UsingQuotes);
    this.Invoke((MethodInvoker)delegate () {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //a data table that we'll use to hold the parsed data
    DataTable dt = new DataTable();

    try {
        //create the parser
        using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding)) {
            //set the parser variables
            parser.TrimWhiteSpace = true;
            parser.TextFieldType = FieldType.Delimited;
            parser.SetDelimiters(Delimiters);
            parser.HasFieldsEnclosedInQuotes = UsingQuotes;
            int LineNumber = 0;
            bool firstLine = true;

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate {
                FilenameLabel.Text = "Preparing to read data file for preview...";
            });

            while (!parser.EndOfData) {
                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate {
                    FilenameLabel.Text = "Loading data file for preview... Data Row #" + LineNumber.ToString();
                });

                //process the row
                string[] fields = parser.ReadFields();
                LineNumber++;

                //get the column headers
                if (firstLine) {
                    firstLine = false;
                    if (HasHeaders) {
                        foreach (var val in fields) dt.Columns.Add(val);
                        LineNumber--;
                        continue;
                    } else {
                        for (int i = 1; i <= fields.Length; i++) dt.Columns.Add("v" + i.ToString());
                    }
                }

                //get the row data; cap the preview at 1,000 rows
                dt.Rows.Add(fields);
                if (LineNumber > 999) break;
            }
        }

        e.Result = dt;

        if (dt.Columns.Count < 1 || dt.Rows.Count < 1) {
            MessageBox.Show("Your spreadsheet file could not be properly parsed" + "\r\n" +
                            "with the current settings. WELP could not find any" + "\r\n" +
                            "distinct columns and/or rows in your data file. This is" + "\r\n" +
                            "most often caused by using the wrong delimiter(s).",
                            "Data Parse Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
    } catch {
        //what to do if there's an error
        e.Result = false;
    }
}
// ----- BGWorker (Heavy Lifting) -----
private void BgWorker_DoWork(object sender, DoWorkEventArgs e) {
    //here, we're basically unpacking and redefining all of the core information that was
    //passed to the background worker. it's a bit redundant and not super efficient, but the
    //loss of efficiency is more than made up for by the gains in readability
    BgWorkerInformation BgData = (BgWorkerInformation)e.Argument;
    Encoding SelectedEncoding = null;
    string InputFile = BgData.InputFile;
    bool HasHeaders = Convert.ToBoolean(BgData.HasHeaders);
    string[] Delimiters = BgData.Delimiters.ToCharArray().Select(c => c.ToString()).ToArray();
    bool UsingQuotes = Convert.ToBoolean(BgData.UsingQuotes);

    //initialize what we'll need later
    this.Invoke((MethodInvoker)delegate () {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    ulong Total_Number_of_Tokens = Convert.ToUInt64(BgData.Tokens_Altogether.Count());
    ulong number_of_word_lists = Convert.ToUInt64(BgData.Tokens.Count());
    int vectorlength = BgData.EndingCol - BgData.StartingCol + 1;
    double[][] averagevector = new double[number_of_word_lists][];
    for (ulong i = 0; i < number_of_word_lists; i++) {
        averagevector[i] = new double[vectorlength];
        for (int j = 0; j < vectorlength; j++) averagevector[i][j] = 0;
    }

    try {
        //create the parser
        using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding)) {
            using (StreamWriter outputFile_subvectors = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_Subvectors.txt"), FileMode.Create, FileAccess.Write), SelectedEncoding)) {
                //set the parser properties
                parser.TrimWhiteSpace = true; //trim the whitespace to make sure that files/folder names don't end with a space, which will break the program
                parser.TextFieldType = FieldType.Delimited;
                parser.SetDelimiters(Delimiters);
                parser.HasFieldsEnclosedInQuotes = UsingQuotes;

                //this is used for header handling and reporting
                bool firstLine = true;
                ulong LineNumber = 0;
                ulong detected_tokens_altogether = 0;
                ulong[] detected_tokens_per_wordlist = new ulong[number_of_word_lists];
                for (ulong i = 0; i < number_of_word_lists; i++) detected_tokens_per_wordlist[i] = 0;

                HashSet<string>[] Detected_Token_Hashset = new HashSet<string>[BgData.Tokens.Length];
                for (int i = 0; i < BgData.Tokens.Length; i++) Detected_Token_Hashset[i] = new HashSet<string>();

                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate {
                    FilenameLabel.Text = "Preparing...";
                });

                //loop through each row of the dataset
                while (!parser.EndOfData && !BgWorker.CancellationPending) {
                    //parse out the row
                    string[] fields = parser.ReadFields();
                    LineNumber++;

                    //report what row we're working on
                    if (LineNumber % 100 == 0) {
                        FilenameLabel.Invoke((MethodInvoker)delegate {
                            FilenameLabel.Text = "Getting average vector(s)... Currently reading row #" + LineNumber.ToString();
                        });
                    }

                    //handle the column headers
                    if (firstLine) {
                        firstLine = false;
                        //essentially, if the first line of the dataset is headers, we'll just skip on to the next line
                        if (HasHeaders) {
                            LineNumber--;
                            continue;
                        }
                    }

                    //first, we want to know if the row even contains a token in our list
                    if (BgData.Tokens_Altogether.Contains(fields[BgData.TokenCol])) {
                        detected_tokens_altogether++;
                        //if it does, then we go in and figure out which word lists contain the word in
                        //question, and do the basic "add word vectors" for each word list that contains it
                        for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++) {
                            if (BgData.Tokens[wordlist_counter].Contains(fields[BgData.TokenCol])) {
                                Detected_Token_Hashset[wordlist_counter].Add(fields[BgData.TokenCol]);
                                detected_tokens_per_wordlist[wordlist_counter]++;
                                try {
                                    //copy just the vector into a new array
                                    string[] vector = new string[vectorlength];
                                    Array.Copy(fields, BgData.StartingCol, vector, 0, vectorlength);
                                    double[] vector_numeric = Array.ConvertAll(vector, Double.Parse);
                                    outputFile_subvectors.WriteLine(fields[BgData.TokenCol] + "\t" + string.Join("\t", vector));
                                    //add values from the new vector
                                    for (int i = 0; i < vectorlength; i++) averagevector[wordlist_counter][i] += vector_numeric[i];
                                } catch {
                                    DialogResult result = MessageBox.Show("There was an error reading your vectors." + "\r\n" +
                                                                          "Are you sure that you selected columns that only contain numbers?",
                                                                          "Vector parsing error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                                    e.Cancel = true;
                                    break;
                                }
                            }
                        }
                    }

                    //if we've found all of the tokens, we don't need to keep looking
                    if (detected_tokens_altogether == Total_Number_of_Tokens) break;
                    if (e.Cancel) break;
                } //end of while for going through data

                //let the user know if there was an issue with finding tokens
                if (detected_tokens_altogether == 0) {
                    MessageBox.Show("None of the tokens in your list were found.", "No Tokens Found", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    e.Cancel = true;
                }

                if (!e.Cancel) {
                    //write a file of tokens that *were* captured
                    StringBuilder tokens_found_output = new StringBuilder();
                    for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++) {
                        //calculate the average vector from the summed vectors
                        for (int i = 0; i < vectorlength; i++) {
                            averagevector[wordlist_counter][i] = averagevector[wordlist_counter][i] / detected_tokens_per_wordlist[wordlist_counter];
                        }

                        string[] tokens_as_array = BgData.Tokens[wordlist_counter].ToArray();
                        List<string> UndetectedTokens = new List<string>();
                        //figure out which words were not caught
                        for (int i = 0; i < tokens_as_array.Length; i++) {
                            if (!Detected_Token_Hashset[wordlist_counter].Contains(tokens_as_array[i])) UndetectedTokens.Add(tokens_as_array[i]);
                        }

                        tokens_found_output.Append("\r\n------------------------------------------------\r\n" +
                                                   "TOKENS FOUND, WORD GROUP #" + (wordlist_counter + 1).ToString() + ":" +
                                                   "\r\n------------------------------------------------\r\n" +
                                                   string.Join("\r\n", Detected_Token_Hashset[wordlist_counter]));
                        tokens_found_output.Append("\r\n\r\n\r\n" +
                                                   "\r\n------------------------------------------------\r\n" +
                                                   "TOKENS NOT FOUND, WORD GROUP #" + (wordlist_counter + 1).ToString() + ":" +
                                                   "\r\n------------------------------------------------\r\n" +
                                                   string.Join("\r\n", UndetectedTokens) + "\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n");
                    }

                    try {
                        using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_AvgVector.txt"), FileMode.Create, FileAccess.Write), SelectedEncoding)) {
                            for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++) {
                                outputFile.WriteLine("Word_Group_" + (wordlist_counter + 1).ToString() + "\t" + string.Join("\t", averagevector[wordlist_counter]));
                            }
                        }
                        using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_TokensFound.txt"), FileMode.Create, FileAccess.Write), SelectedEncoding)) {
                            outputFile.Write(tokens_found_output);
                        }
                    } catch {
                        DialogResult result = MessageBox.Show("There was an error writing your output.", "Write file error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                        e.Cancel = true;
                    }
                }
            } //end "using" for retained vector output
        } //end of "using" textfieldparser
    } //end of try
    catch {
        DialogResult result = MessageBox.Show("An error occurred somewhere while trying to parse your model file.", "General Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
        e.Cancel = true;
    }

    // ----- Calculate Cosine Similarities -----

    try {
        if (!e.Cancel) {
            using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding)) {
                //set the parser properties
                parser.TrimWhiteSpace = true; //trim the whitespace to make sure that files/folder names don't end with a space, which will break the program
                parser.TextFieldType = FieldType.Delimited;
                parser.SetDelimiters(Delimiters);
                parser.HasFieldsEnclosedInQuotes = UsingQuotes;

                //this is used for header handling and reporting
                bool firstLine = true;
                ulong LineNumber = 0;

                using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BgData.OutputLocation, "_WELP_CosineSim.csv"), FileMode.Create, FileAccess.Write), SelectedEncoding)) {
                    //write the header row
                    string header = "\"Token\"";
                    for (ulong i = 0; i < number_of_word_lists; i++) header += ",\"Grp_" + (i + 1).ToString() + "_CosineSim\"";
                    outputFile.WriteLine(header);

                    while (!parser.EndOfData && !BgWorker.CancellationPending) {
                        //parse out the row
                        string[] fields = parser.ReadFields();
                        LineNumber++;

                        //report what row we're working on
                        if (LineNumber % 100 == 0) {
                            FilenameLabel.Invoke((MethodInvoker)delegate {
                                FilenameLabel.Text = "Calculating cosine similarities... Currently reading row #" + LineNumber.ToString();
                            });
                        }

                        //handle the column headers
                        if (firstLine) {
                            firstLine = false;
                            //essentially, if the first line of the dataset is headers, we'll just skip on to the next line
                            if (HasHeaders) {
                                LineNumber--;
                                continue;
                            }
                        }

                        try {
                            //if it's not the header row, then let's get the vector
                            string[] vector = new string[vectorlength];
                            Array.Copy(fields, BgData.StartingCol, vector, 0, vectorlength);
                            double[] vector_numeric = Array.ConvertAll(vector, Double.Parse);

                            //let's calculate the cosine similarity between our mean vector
                            //and the token on the current row
                            //https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
                            //Cosine Similarity (d1, d2) = Dot product(d1, d2) / (||d1|| * ||d2||)
                            //Dot product (d1, d2) = d1[0] * d2[0] + d1[1] * d2[1] + ... + d1[n] * d2[n]
                            //||d1|| = square root(d1[0]^2 + d1[1]^2 + ... + d1[n]^2)
                            //||d2|| = square root(d2[0]^2 + d2[1]^2 + ... + d2[n]^2)
                            //(a condensed helper version of this computation is sketched after this method)
                            bool at_least_one_cossim = false;
                            double[] CosineSims = new double[number_of_word_lists];
                            for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++) {
                                double dotproduct = 0;
                                double d1 = 0;
                                double d2 = 0;
                                //calculate cosine similarity components
                                for (int i = 0; i < vectorlength; i++) {
                                    dotproduct += averagevector[wordlist_counter][i] * vector_numeric[i];
                                    d1 += averagevector[wordlist_counter][i] * averagevector[wordlist_counter][i];
                                    d2 += vector_numeric[i] * vector_numeric[i];
                                }
                                CosineSims[wordlist_counter] = dotproduct / (Math.Sqrt(d1) * Math.Sqrt(d2));
                                if (Math.Abs(CosineSims[wordlist_counter]) > BgData.OmitBelowValue) at_least_one_cossim = true;
                            }

                            if (BgData.OmitBelowValue == 0.0 || at_least_one_cossim) {
                                StringBuilder LineToWrite = new StringBuilder();
                                //write the output, making sure to escape quotes
                                if (fields[BgData.TokenCol].Contains('"')) {
                                    LineToWrite.Append("\"" + fields[BgData.TokenCol].Replace("\"", "\"\"") + "\"");
                                } else {
                                    LineToWrite.Append("\"" + fields[BgData.TokenCol] + "\"");
                                }
                                for (ulong wordlist_counter = 0; wordlist_counter < number_of_word_lists; wordlist_counter++) {
                                    if (BgData.OmitBelowValue == 0.0 || Math.Abs(CosineSims[wordlist_counter]) > BgData.OmitBelowValue) {
                                        LineToWrite.Append("," + CosineSims[wordlist_counter]);
                                    } else {
                                        LineToWrite.Append(",");
                                    }
                                }
                                outputFile.WriteLine(LineToWrite);
                            }
                        } catch {
                            DialogResult result = MessageBox.Show("There was an error reading your vectors." + "\r\n" +
                                                                  "Are you sure that you selected columns that only contain numbers?",
                                                                  "Vector parsing error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                            e.Cancel = true;
                            break;
                        }
                    } //end while
                } //end "using" for file output
            } //end "using" for textfieldparser
        } //end of "if e.cancel is false"
    } //end of try
    catch {
        DialogResult result = MessageBox.Show("An error occurred somewhere while trying to calculate similarities.", "General Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
        e.Cancel = true;
    }
}
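// The cosine similarity computed inline above (dot product divided by the product of the
// vector norms) can be factored into this small helper; a sketch assuming equal-length,
// non-zero vectors:
static double CosineSimilarity(double[] a, double[] b) {
    double dot = 0, normA = 0, normB = 0;
    for (int i = 0; i < a.Length; i++) {
        dot += a[i] * b[i];    //dot product accumulator
        normA += a[i] * a[i];  //squared magnitude of a
        normB += b[i] * b[i];  //squared magnitude of b
    }
    return dot / (Math.Sqrt(normA) * Math.Sqrt(normB));
}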
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e) {
    DictionaryData BGWorkerData = (DictionaryData)e.Argument;
    TranslationClient client = TranslationClient.Create();

    //selects the text encodings based on user selection
    Encoding InputSelectedEncoding = null;
    Encoding OutputSelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate () {
        InputSelectedEncoding = Encoding.GetEncoding(InputEncodingDropdown.SelectedItem.ToString());
        OutputSelectedEncoding = Encoding.GetEncoding(OutputEncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked) SearchDepth = SearchOption.AllDirectories;
    var files = Directory.EnumerateFiles(BGWorkerData.TextFileFolder, BGWorkerData.FileExtension, SearchDepth);

    try {
        foreach (string fileName in files) {
            if (e.Cancel) break;

            //set up our variables to report
            string Filename_Clean = Path.GetFileName(fileName);
            string SubDirStructure = Path.GetDirectoryName(fileName).Replace(BGWorkerData.TextFileFolder, "").TrimStart('\\');

            //creates subdirectories if they don't exist
            string Output_Location = BGWorkerData.OutputFileLocation + '\\' + SubDirStructure;
            if (!Directory.Exists(Output_Location)) Directory.CreateDirectory(Output_Location);
            Output_Location = Path.Combine(Output_Location, Path.GetFileName(fileName));

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate {
                FilenameLabel.Text = "Processing: " + Filename_Clean;
                FilenameLabel.Invalidate();
                FilenameLabel.Update();
                FilenameLabel.Refresh();
                Application.DoEvents();
            });

            // ----- Write Output -----

            using (StreamReader inputfile = new StreamReader(fileName, InputSelectedEncoding)) {
                if (e.Cancel) break;
                string readText = inputfile.ReadToEnd();
                string[] readText_Chunked = new string[0];
                if (!string.IsNullOrWhiteSpace(readText)) readText_Chunked = SplitStringByLength(readText, BGWorkerData.MaxCharsPerRequest);

                StringBuilder TranslatedText_Output = new StringBuilder();
                for (int i = 0; i < readText_Chunked.Length; i++) {
                    if (e.Cancel) break;
                    try {
                        if (e.Cancel) break;
                        StatusLabel.Invoke((MethodInvoker)delegate {
                            StatusLabel.Text = "Status: Sending request " + (i + 1).ToString() + "/" + readText_Chunked.Length.ToString() + " to API...";
                            StatusLabel.Invalidate();
                            StatusLabel.Update();
                            StatusLabel.Refresh();
                            Application.DoEvents();
                        });
                        var response = client.TranslateText(readText_Chunked[i], sourceLanguage: BGWorkerData.InputLang, targetLanguage: BGWorkerData.OutputLang);
                        TranslatedText_Output.Append(response.TranslatedText + " ");
                    } catch (Google.GoogleApiException ex) {
                        if (e.Cancel) break;
                        if (ex.Error.Code == 403) {
                            if (ex.Error.Message.Contains("Daily Limit Exceeded")) {
                                //report what we're working on
                                StatusLabel.Invoke((MethodInvoker)delegate {
                                    StatusLabel.Text = "Status: " + ex.Error.Message;
                                    StatusLabel.Invalidate();
                                    StatusLabel.Update();
                                    StatusLabel.Refresh();
                                    Application.DoEvents();
                                });
                                MessageBox.Show("The Google Translate API reports that you have exceeded your daily use limit. You will need to visit the \"Quotas\" section of the Google Cloud Dashboard to increase your limits or, alternatively, wait until midnight for your quota to reset.", "Daily Limit Exceeded", MessageBoxButtons.OK, MessageBoxIcon.Stop);
                                e.Cancel = true;
                                break;
                            } else {
                                if (e.Cancel) break;
                                int retry_counter = 0;
                                while (retry_counter < BGWorkerData.MaxRetries) {
                                    retry_counter++;
                                    int TimerCounter = 0;
                                    DateTime d = DateTime.Now;
                                    while (TimerCounter < BGWorkerData.DurationLength + 2) {
                                        TimeSpan ts = DateTime.Now.Subtract(d);
                                        if (ts.Seconds >= 1) {
                                            //one second has elapsed; update the countdown
                                            TimerCounter += ts.Seconds;
                                            d = DateTime.Now;
                                            //report what we're working on
                                            StatusLabel.Invoke((MethodInvoker)delegate {
                                                StatusLabel.Text = "Status: Rate limit reached. Sleeping for " + (BGWorkerData.DurationLength - TimerCounter + 1).ToString() + "...";
                                                StatusLabel.Invalidate();
                                                StatusLabel.Update();
                                                StatusLabel.Refresh();
                                                Application.DoEvents();
                                            });
                                        }
                                    }
                                    try {
                                        //report what we're working on
                                        StatusLabel.Invoke((MethodInvoker)delegate {
                                            StatusLabel.Text = "Status: Sending request " + (i + 1).ToString() + "/" + readText_Chunked.Length.ToString() + " to API... Retry #" + retry_counter.ToString();
                                            StatusLabel.Invalidate();
                                            StatusLabel.Update();
                                            StatusLabel.Refresh();
                                            Application.DoEvents();
                                        });
                                        var response = client.TranslateText(readText_Chunked[i], sourceLanguage: BGWorkerData.InputLang, targetLanguage: BGWorkerData.OutputLang);
                                        TranslatedText_Output.Append(response.TranslatedText + " ");
                                        retry_counter = BGWorkerData.MaxRetries;
                                    } catch { }
                                }
                            }
                        } else if (ex.Error.Code == 429 || (ex.Error.Code >= 500 && ex.Error.Code < 600)) {
                            int retry_counter = 0;
                            while (retry_counter < BGWorkerData.MaxRetries) {
                                retry_counter++;
                                int TimerCounter = 0;
                                DateTime d = DateTime.Now;
                                //wait for the square of the retry count (in seconds) before trying again
                                while (TimerCounter < System.Math.Pow(retry_counter, 2)) {
                                    TimeSpan ts = DateTime.Now.Subtract(d);
                                    if (ts.Seconds >= 1) {
                                        //one second has elapsed; update the countdown
                                        TimerCounter += ts.Seconds;
                                        d = DateTime.Now;
                                        //report what we're working on
                                        StatusLabel.Invoke((MethodInvoker)delegate {
                                            StatusLabel.Text = "Status: Error " + ex.Error.Code.ToString() + "; " + ex.Error.Message + " -- Retrying in " + (System.Math.Pow(retry_counter, 2) - TimerCounter).ToString() + "...";
                                            StatusLabel.Invalidate();
                                            StatusLabel.Update();
                                            StatusLabel.Refresh();
                                            Application.DoEvents();
                                        });
                                    }
                                }
                                try {
                                    //report what we're working on
                                    StatusLabel.Invoke((MethodInvoker)delegate {
                                        StatusLabel.Text = "Status: Sending request " + (i + 1).ToString() + "/" + readText_Chunked.Length.ToString() + " to API... Retry #" + retry_counter.ToString();
                                        StatusLabel.Invalidate();
                                        StatusLabel.Update();
                                        StatusLabel.Refresh();
                                        Application.DoEvents();
                                    });
                                    var response = client.TranslateText(readText_Chunked[i], sourceLanguage: BGWorkerData.InputLang, targetLanguage: BGWorkerData.OutputLang);
                                    TranslatedText_Output.Append(response.TranslatedText + " ");
                                    retry_counter = BGWorkerData.MaxRetries;
                                } catch { }
                            }
                        } else {
                            //report the error that we hit
                            StatusLabel.Invoke((MethodInvoker)delegate {
                                StatusLabel.Text = "Status: " + ex.Error.Message;
                                StatusLabel.Invalidate();
                                StatusLabel.Update();
                                StatusLabel.Refresh();
                                Application.DoEvents();
                            });
                        }
                    }
                }

                //open up the output file
                using (StreamWriter outputFile = new StreamWriter(new FileStream(Output_Location, FileMode.Create), OutputSelectedEncoding)) {
                    outputFile.Write(TranslatedText_Output.ToString());
                }
            }
        }
    } catch (Exception ex) {
        MessageBox.Show("Transmogrifier encountered an issue somewhere while trying to translate your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application? After clicking the \"OK\" Button, you will receive an error code. Please write down this error code (or take a screenshot) and contact the software's author ([email protected]) for additional help.", "Error while translating", MessageBoxButtons.OK, MessageBoxIcon.Error);
        MessageBox.Show(ex.ToString(), "Error Code", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
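// The retry logic above waits for the square of the retry count between attempts and gives
// up after MaxRetries. A condensed sketch of that quadratic-backoff policy, using a plain
// Thread.Sleep in place of the label-updating busy-wait; runRequest is a hypothetical
// stand-in for the TranslateText call:
static string RetryWithQuadraticBackoff(Func<string> runRequest, int maxRetries) {
    for (int attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            return runRequest();
        } catch {
            if (attempt == maxRetries) throw; //out of retries: surface the error
            System.Threading.Thread.Sleep(TimeSpan.FromSeconds(Math.Pow(attempt, 2)));
        }
    }
    return null; //unreachable
}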
// ----- Write Output Files -----
private void BgWorker_DoWork(object sender, DoWorkEventArgs e) {
    //here, we're basically unpacking and redefining all of the core information that was
    //passed to the background worker. it's a bit redundant and not super efficient, but the
    //loss of efficiency is more than made up for by the gains in readability
    BgWorkerInformation BgData = (BgWorkerInformation)e.Argument;
    Encoding SelectedEncoding = null;
    string InputFile = BgData.InputFile;
    bool HasHeaders = Convert.ToBoolean(BgData.HasHeaders);
    string[] Delimiters = BgData.Delimiters.ToCharArray().Select(c => c.ToString()).ToArray();
    bool UsingQuotes = Convert.ToBoolean(BgData.UsingQuotes);
    bool DumpOutputAsTXT = false;

    this.Invoke((MethodInvoker)delegate () {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
        DumpOutputAsTXT = DumpAsTextCheckbox.Checked;
    });

    string OutputFile = BgData.OutputLocation + Path.DirectorySeparatorChar + "_SLIM_" + Path.GetFileName(InputFile);
    if (DumpOutputAsTXT) OutputFile += ".txt";

    try {
        //create the parser
        using (TextFieldParser parser = new TextFieldParser(InputFile, SelectedEncoding)) {
            //set the parser properties
            parser.TrimWhiteSpace = true; //trim the whitespace to make sure that files/folder names don't end with a space, which will break the program
            parser.TextFieldType = FieldType.Delimited;
            parser.SetDelimiters(Delimiters);
            parser.HasFieldsEnclosedInQuotes = UsingQuotes;

            bool firstLine = true;
            ulong LineNumber = 0;
            ulong FileNumber = 0;
            ulong LastFileNumberforFolderCreation = 0;
            ulong FolderNumber = 0;

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate {
                FilenameLabel.Text = "Preparing to write output files...";
            });

            using (FileStream fileStream = new FileStream(OutputFile, FileMode.Create, FileAccess.Write, FileShare.Read))
            using (StreamWriter streamWriter = new StreamWriter(fileStream, SelectedEncoding)) {
                //loop through each row of the dataset
                while (!parser.EndOfData && !BgWorker.CancellationPending) {
                    //parse out the row
                    string[] fields = parser.ReadFields();
                    LineNumber++;

                    //report what row we're working on
                    if (LineNumber % 10 == 0) {
                        FilenameLabel.Invoke((MethodInvoker)delegate {
                            FilenameLabel.Text = "Currently writing row #" + LineNumber.ToString();
                        });
                    }

                    //prepare our output to write
                    string[] output_array = new string[BgData.NumberOfColumns];
                    for (int i = 0; i < BgData.NumberOfColumns; i++) {
                        if (UsingQuotes && DumpOutputAsTXT == false) {
                            output_array[i] = '"' + fields[BgData.KeepCols[i]].Replace("\"", "\"\"") + '"';
                        } else {
                            output_array[i] = fields[BgData.KeepCols[i]];
                        }
                    }

                    //write our output
                    if (DumpOutputAsTXT) {
                        streamWriter.WriteLine(string.Join("\r\n", output_array));
                    } else {
                        streamWriter.WriteLine(string.Join(Delimiters[0], output_array));
                    }

                    if (e.Cancel) break;
                }
            }
        }
        e.Result = null;
    } catch {
        MessageBox.Show("SlimCSV has encountered an error while processing your file.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
        e.Result = "error";
    }
}
private void BgWorker_DoWork(object sender, DoWorkEventArgs e) {
    uint WordWindowSize = 100;
    uint MaxPhraseLength = 3;
    uint BigWordSize = 6;

    //set up our newline cleanup regex
    Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate () {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
        WordWindowSize = Convert.ToUInt32(WordWindowSizeTextbox.Text);
        MaxPhraseLength = Convert.ToUInt32(PhraseLengthTextbox.Text);
        BigWordSize = Convert.ToUInt32(BigWordTextBox.Text);
    });

    if (WordWindowSize < 2) WordWindowSize = 2;
    if (MaxPhraseLength > WordWindowSize - 1) MaxPhraseLength = WordWindowSize - 1;
    if (MaxPhraseLength < 1) MaxPhraseLength = 1;

    //the very first thing that we want to do is set up our function word lists
    List<string> FunctionWordWildcardList = new List<string>();
    List<string> FunctionWordsToHash = new List<string>();
    string[] OriginalFunctionWordList = NewlineClean.Split(FunctionWordTextBox.Text.ToLower());
    OriginalFunctionWordList = OriginalFunctionWordList.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
    foreach (string Word in OriginalFunctionWordList) {
        string WordToParse = Word.Trim();
        if (WordToParse.Contains('*')) {
            FunctionWordWildcardList.Add(WordToParse.Replace("*", ""));
        } else {
            FunctionWordsToHash.Add(WordToParse);
        }
    }

    //remove duplicates
    FunctionWordWildcardList = FunctionWordWildcardList.Distinct().ToList();
    FunctionWordsToHash = FunctionWordsToHash.Distinct().ToList();
    HashSet<string> HashedFuncWords = new HashSet<string>(FunctionWordsToHash);
    string[] FunctionWordWildCards = FunctionWordWildcardList.ToArray();
    FunctionWordsToHash = null;
    FunctionWordWildcardList = null;

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked) SearchDepth = SearchOption.AllDirectories;
    var files = Directory.EnumerateFiles(((string[])e.Argument)[0], "*.txt", SearchDepth);

    try {
        using (StreamWriter outputFile = new StreamWriter(((string[])e.Argument)[1])) {
            string HeaderString = "\"Filename\",\"WC\",\"BigWordPercent\",\"AvgUniqueWPWindow\",\"Overall_Repeat_1word\",\"Funct_Repeat_1word\",\"Content_Repeat_1word\",\"BigWordRepeat\"";
            for (ushort i = 2; i <= MaxPhraseLength; i += 1) HeaderString += ",\"Overall_Repeat_" + i.ToString() + "word\"";
            outputFile.WriteLine(HeaderString);

            foreach (string fileName in files) {
                //set up our variables to report
                string Filename_Clean = Path.GetFileName(fileName);
                int TotalNumberOfWords = 0;
                double AvgUniqueWPWindow = 0;
                double TotalRepetition = 0.0;
                //double AvgWPS = 0.0;
                double FunctionWordRepetition = 0.0;
                double ContentWordRepetition = 0.0;
                double SixLtrWordRepetition = 0;
                ulong SixLtrWordsTotal = 0;

                //sets up our word phrase dictionaries
                Dictionary<int, double> PhraseDict = new Dictionary<int, double>();
                for (ushort i = 2; i <= MaxPhraseLength; i += 1) PhraseDict.Add(i, 0.0);

                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate {
                    FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                });

                //read in and clean up the text
                string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();
                readText = NewlineClean.Replace(readText, " ");
                //remove all the junk punctuation
                foreach (char c in PunctuationBox.Text) readText = readText.Replace(c, ' ');

                //splits everything out into words
                string[] Words = readText.Trim().Split(' ');
                Words = Words.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
                for (int i = 0; i < Words.Length; i++) {
                    if (Words[i].Length > BigWordSize - 1) SixLtrWordsTotal++;
                }
                TotalNumberOfWords += Words.Count();

                UInt64 ContentWordsDenominator = 0;
                UInt64 FunctionWordsDenominator = 0;
                UInt64 WordWindowIterations = 0;

                //make sure that the text is at least long enough to analyze
                if (TotalNumberOfWords >= WordWindowSize) {
                    //this is where we make a moving window
                    for (uint BigCounter = 0; BigCounter <= (Words.Length - WordWindowSize); BigCounter += 1) {
                        WordWindowIterations += 1;
                        var WordWindow = new string[WordWindowSize];
                        Array.Copy(Words, BigCounter, WordWindow, 0, WordWindowSize);

                        //do our full phrase repetition measures
                        for (int i = 2; i <= MaxPhraseLength; i += 1) {
                            var PhraseWindow = new string[WordWindowSize - (i - 1)];
                            for (int j = 0; j <= (WordWindowSize - i); j += 1) {
                                string[] temp_phrase = new string[i];
                                //copy the phrase from the current window (not from the start of the text)
                                Array.Copy(WordWindow, j, temp_phrase, 0, i);
                                PhraseWindow[j] = String.Join(" ", temp_phrase);
                            }
                            //add in the unique phrase percentage
                            PhraseDict[i] += PhraseWindow.Distinct().ToArray().Length / ((double)WordWindowSize - (i - 1));
                        }

                        //AvgWPS += Words.Count();
                        AvgUniqueWPWindow += WordWindow.Distinct().ToArray().Length;
                        TotalRepetition += WordWindow.Distinct().ToArray().Length / (double)WordWindowSize;

                        //now we go through and redo the same thing, separately, for function words and content words.
                        //the first thing that we need to do is separate the function words from the content words
                        List<string> FunctionWords = new List<string>();
                        List<string> ContentWords = new List<string>();
                        List<string> SixLtrWords = new List<string>();
                        for (int i = 0; i < WordWindow.Length; i++) {
                            //check the length of the word
                            if (WordWindow[i].Length > BigWordSize - 1) SixLtrWords.Add(WordWindow[i]);

                            //first, check against the hashset
                            bool IsFunctionWord = HashedFuncWords.Contains(WordWindow[i]);
                            //if it wasn't found in the hashset, we'll loop through the wildcard function words
                            if (!IsFunctionWord) {
                                for (int j = 0; j < FunctionWordWildCards.Length; j++) {
                                    if (WordWindow[i].StartsWith(FunctionWordWildCards[j])) {
                                        IsFunctionWord = true;
                                        break;
                                    }
                                }
                            }
                            //if the word didn't match anywhere, then it's a content word
                            if (IsFunctionWord) FunctionWords.Add(WordWindow[i]);
                            else ContentWords.Add(WordWindow[i]);
                        }

                        if (ContentWords.Count() > 0) {
                            ContentWordRepetition += ContentWords.Distinct().ToArray().Length / (double)ContentWords.Count();
                            ContentWordsDenominator += 1;
                        }
                        if (FunctionWords.Count() > 0) {
                            FunctionWordRepetition += FunctionWords.Distinct().ToArray().Length / (double)FunctionWords.Count();
                            FunctionWordsDenominator += 1;
                        }
                        if (SixLtrWords.Count() > 0) {
                            SixLtrWordRepetition += SixLtrWords.Distinct().ToArray().Length / (double)SixLtrWords.Count();
                        }
                    }
                }

                //divide everything by the number of windows
                TotalRepetition = (float)TotalRepetition / (TotalNumberOfWords - (WordWindowSize - 1));
                FunctionWordRepetition = (float)FunctionWordRepetition / FunctionWordsDenominator;
                ContentWordRepetition = (float)ContentWordRepetition / ContentWordsDenominator;
                SixLtrWordRepetition = (float)SixLtrWordRepetition / (TotalNumberOfWords - (WordWindowSize - 1));
                AvgUniqueWPWindow = (float)AvgUniqueWPWindow / (TotalNumberOfWords - (WordWindowSize - 1));

                if (TotalNumberOfWords >= WordWindowSize) {
                    string[] OutputString = new string[8 + MaxPhraseLength - 1];
                    OutputString[0] = '"' + Filename_Clean + '"';
                    OutputString[1] = TotalNumberOfWords.ToString();
                    OutputString[2] = Math.Round((SixLtrWordsTotal / (double)TotalNumberOfWords) * 100, 3).ToString();
                    OutputString[3] = Math.Round(AvgUniqueWPWindow, 3).ToString();
                    OutputString[4] = Math.Round((1 - TotalRepetition) * 100, 3).ToString();
                    OutputString[5] = Math.Round((1 - FunctionWordRepetition) * 100, 3).ToString();
                    OutputString[6] = Math.Round((1 - ContentWordRepetition) * 100, 3).ToString();
                    OutputString[7] = Math.Round((1 - SixLtrWordRepetition) * 100, 3).ToString();
                    for (int i = 0; i < MaxPhraseLength - 1; i += 1) {
                        OutputString[8 + i] = Math.Round((1 - (PhraseDict[i + 2] / ((float)TotalNumberOfWords - (WordWindowSize - 1)))) * 100, 3).ToString();
                    }
                    outputFile.WriteLine(String.Join(",", OutputString));
                } else {
                    outputFile.WriteLine('"' + Filename_Clean + '"' + "," + TotalNumberOfWords.ToString());
                }
            }
        }
    } catch {
        MessageBox.Show("Repeatalizer could not open your output file\r\nfor writing. Is the file open in another application?");
    }
}
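// The repetition scores above are averages of per-window type/token ratios, taken over every
// position of a moving window. Stripped of the function/content/big-word splits, the core
// measurement reduces to this sketch (illustrative, not Repeatalizer's exact code; assumes a
// System.Linq using, as elsewhere in these files):
static double AverageWindowTTR(string[] words, int windowSize) {
    if (words.Length < windowSize) return double.NaN;
    int windows = words.Length - windowSize + 1;
    double sum = 0;
    for (int start = 0; start < windows; start++) {
        var window = new string[windowSize];
        Array.Copy(words, start, window, 0, windowSize);
        sum += window.Distinct().Count() / (double)windowSize; //unique-word ratio for this window
    }
    return sum / windows;
}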
private void BgWorker_DoWork(object sender, DoWorkEventArgs e) {
    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate () {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked) SearchDepth = SearchOption.AllDirectories;
    var files = Directory.EnumerateFiles(((string[])e.Argument)[0], "*.txt", SearchDepth);

    try {
        string outputdir = Path.Combine(((string[])e.Argument)[1]);
        Directory.CreateDirectory(outputdir);

        foreach (string fileName in files) {
            //set up our variables to report
            string Filename_Clean = Path.GetFileName(fileName);

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate {
                FilenameLabel.Text = "Analyzing: " + Filename_Clean;
            });

            //tokenize the text, dropping whitespace tokens
            string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();
            var TokenResults = TwitterKoreanProcessorCS.Tokenize(readText);
            StringBuilder Builder = new StringBuilder();
            int tokenCount = TokenResults.Count();
            for (int i = 0; i < tokenCount; i++) {
                if (TokenResults.ElementAt(i).Pos != KoreanPos.Space) Builder.Append(TokenResults.ElementAt(i).Text + ' ');
            }

            using (System.IO.StreamWriter fileout = new StreamWriter(Path.Combine(outputdir, Filename_Clean), false, SelectedEncoding)) {
                fileout.Write(Builder.ToString());
            }
        }
    } catch {
        MessageBox.Show("KoToken encountered a problem while trying to tokenize/write a file.");
    }
}
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
{
    DictionaryData DictData = (DictionaryData)e.Argument;

    //collapses line breaks into spaces before tokenizing
    Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);

    try
    {
        //open up the output file
        using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
        {
            //write the header row to the output file, quoting category names and
            //escaping embedded quotes so that the CSV stays well-formed
            StringBuilder HeaderString = new StringBuilder();
            HeaderString.Append("\"Filename\",\"WC\",\"DictPercent\"");
            for (int i = 0; i < DictData.NumCats; i++)
            {
                HeaderString.Append(",\"" + DictData.CatNames[i].Replace("\"", "\"\"") + "\"");
            }
            outputFile.WriteLine(HeaderString.ToString());

            foreach (string fileName in files)
            {
                //set up our variables to report
                string Filename_Clean = Path.GetFileName(fileName);
                Dictionary<string, int> DictionaryResults = new Dictionary<string, int>();
                for (int i = 0; i < DictData.NumCats; i++)
                {
                    DictionaryResults.Add(DictData.CatValues[i], 0);
                }

                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate
                {
                    FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                });

                //read in the text file, convert everything to lowercase
                string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();
                readText = NewlineClean.Replace(readText, " ");

                //remove all the junk punctuation
                foreach (char c in PunctuationBox.Text)
                {
                    readText = readText.Replace(c, ' ');
                }

                int NumberOfMatches = 0;

                //splits everything out into words. we're splitting on spaces here
                //principally because we leave it up to the user to decide what
                //characters they want to remove; we're assuming that they have
                //removed tabs already (as is set up by default)
                string[] Words = readText.Trim().Split(' ');
                Words = Words.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
                int TotalStringLength = Words.Length;

                // ======================= Analyze Text =======================

                //iterate over all words in the text file
                for (int i = 0; i < TotalStringLength; i++)
                {
                    //iterate over n-grams, starting with the largest possible n-gram
                    //(derived from the user's dictionary file)
                    for (int NumberOfWords = DictData.MaxWords; NumberOfWords > 0; NumberOfWords--)
                    {
                        //make sure that we don't overextend past the array
                        if (i + NumberOfWords - 1 >= TotalStringLength)
                        {
                            continue;
                        }

                        //make the target string
                        string TargetString;
                        if (NumberOfWords > 1)
                        {
                            TargetString = String.Join(" ", Words.Skip(i).Take(NumberOfWords).ToArray());
                        }
                        else
                        {
                            TargetString = Words[i];
                        }

                        //look for an exact match
                        if (DictData.FullDictionary["Standards"].ContainsKey(NumberOfWords) &&
                            DictData.FullDictionary["Standards"][NumberOfWords].ContainsKey(TargetString))
                        {
                            NumberOfMatches += NumberOfWords;

                            //add in the number of words found for each category that this entry scores
                            string[] Categories = DictData.FullDictionary["Standards"][NumberOfWords][TargetString];
                            for (int j = 0; j < Categories.Length; j++)
                            {
                                if (DictionaryResults.ContainsKey(Categories[j]))
                                {
                                    DictionaryResults[Categories[j]] += NumberOfWords;
                                }
                            }

                            //manually increment the for loop so that we're not testing on
                            //words that have already been picked up
                            i += NumberOfWords - 1;
                            //break out of the lower-level for loop back to moving on to new words altogether
                            break;
                        }

                        //if there isn't an exact match, we have to go through the wildcards
                        if (DictData.WildCardArrays.ContainsKey(NumberOfWords))
                        {
                            bool MatchedWildcard = false;
                            for (int j = 0; j < DictData.WildCardArrays[NumberOfWords].Length; j++)
                            {
                                string WildcardEntry = DictData.WildCardArrays[NumberOfWords][j];
                                if (DictData.PrecompiledWildcards[WildcardEntry].Matches(TargetString).Count > 0)
                                {
                                    NumberOfMatches += NumberOfWords;
                                    string[] Categories = DictData.FullDictionary["Wildcards"][NumberOfWords][WildcardEntry];
                                    for (int k = 0; k < Categories.Length; k++)
                                    {
                                        if (DictionaryResults.ContainsKey(Categories[k]))
                                        {
                                            DictionaryResults[Categories[k]] += NumberOfWords;
                                        }
                                    }

                                    //manually increment the for loop so that we're not testing on
                                    //words that have already been picked up
                                    i += NumberOfWords - 1;
                                    MatchedWildcard = true;
                                    break;
                                }
                            }
                            if (MatchedWildcard)
                            {
                                //break out of the lower-level for loop back to moving on to new words altogether
                                break;
                            }
                        }
                    }
                }

                // ======================= Write Output =======================

                string[] OutputString = new string[3 + DictData.NumCats];
                OutputString[0] = "\"" + Filename_Clean + "\"";
                OutputString[1] = TotalStringLength.ToString();
                if (TotalStringLength > 0)
                {
                    OutputString[2] = (((double)NumberOfMatches / TotalStringLength) * 100).ToString();
                    if (DictData.RawWordCounts)
                    {
                        for (int i = 0; i < DictData.NumCats; i++)
                        {
                            OutputString[i + 3] = DictionaryResults[DictData.CatValues[i]].ToString();
                        }
                    }
                    else
                    {
                        for (int i = 0; i < DictData.NumCats; i++)
                        {
                            OutputString[i + 3] = (((double)DictionaryResults[DictData.CatValues[i]] / TotalStringLength) * 100).ToString();
                        }
                    }
                }
                else
                {
                    OutputString[2] = "";
                    for (int i = 0; i < DictData.NumCats; i++)
                    {
                        OutputString[i + 3] = "";
                    }
                }
                outputFile.WriteLine(String.Join(",", OutputString));
            }
        }
    }
    catch
    {
        MessageBox.Show("RIOTLite encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while RIOTLite is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
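//A minimal sketch (an assumption, not taken from the source) of the DictionaryData
//members that the handler above dereferences; the real class presumably carries more
//state, and the field types here are inferred purely from usage.
public class DictionaryData
{
    public string TextFileFolder;              //root folder of the input .txt files
    public string OutputFileLocation;          //full path of the output CSV
    public int NumCats;                        //number of dictionary categories
    public string[] CatNames;                  //category display names for the header row
    public string[] CatValues;                 //category keys used in DictionaryResults
    public int MaxWords;                       //longest n-gram in the user's dictionary
    public bool RawWordCounts;                 //raw counts instead of percentages

    //"Standards"/"Wildcards" -> n-gram length -> dictionary entry -> categories it scores
    public Dictionary<string, Dictionary<int, Dictionary<string, string[]>>> FullDictionary;
    public Dictionary<int, string[]> WildCardArrays;        //wildcard entries, keyed by n-gram length
    public Dictionary<string, Regex> PrecompiledWildcards;  //compiled pattern for each wildcard entry
}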
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
{
    DictionaryData DictData = (DictionaryData)e.Argument;

    //report what we're working on
    FilenameLabel.Invoke((MethodInvoker)delegate
    {
        FilenameLabel.Text = "Loading CoreNLP models... please wait...";
    });

    //largely taken from here: https://github.com/sergey-tihon/Stanford.NLP.NET/issues/39
    var jarRoot = @"stanford-corenlp-full-2018-02-27\";
    var props = new java.util.Properties();
    props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
    props.setProperty("sutime.binders", "0");

    //the model files have to be loaded with the jar root as the working directory;
    //we restore the original working directory once the pipeline is built
    var curDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(Path.Combine(Path.GetDirectoryName(AppDomain.CurrentDomain.BaseDirectory), jarRoot));
    var pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(curDir);

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);

    try
    {
        //open up the document-level output file
        using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
        {
            //open up the sentence-level output file
            using (StreamWriter outputFileSentences = new StreamWriter(new FileStream(AddSuffix(DictData.OutputFileLocation, "_Sentences"), FileMode.Create), SelectedEncoding))
            {
                //write the header rows to the output files
                StringBuilder HeaderString = new StringBuilder();
                HeaderString.Append("\"Filename\",\"Sentences\",\"Classification\",\"Classification_M\",\"Classification_SD\"");
                outputFile.WriteLine(HeaderString.ToString());

                StringBuilder HeaderStringSentence = new StringBuilder();
                HeaderStringSentence.Append("\"Filename\",\"SentNumber\",\"SentenceText\",\"Classification\",\"Class_Prob\",\"Class_Number\"");
                outputFileSentences.WriteLine(HeaderStringSentence.ToString());

                foreach (string fileName in files)
                {
                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker)delegate
                    {
                        FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    });

                    //read in the text file and trim surrounding whitespace
                    string InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();

                    // ======================= Analyze Text =======================

                    var annotation = new edu.stanford.nlp.pipeline.Annotation(InputText);
                    pipeline.annotate(annotation);

                    List<double> SentimentValues = new List<double>();
                    var sentences = annotation.get(new CoreAnnotations.SentencesAnnotation().getClass()) as ArrayList;
                    int SentenceCount = 0;

                    foreach (CoreMap sentence in sentences)
                    {
                        SentenceCount++;
                        Tree tree = sentence.get(new SentimentCoreAnnotations.SentimentAnnotatedTree().getClass()) as Tree;

                        //add this sentence to our overall list of sentiment scores
                        SentimentValues.Add(RNNCoreAnnotations.getPredictedClass(tree));

                        // ======================= Write Output =======================

                        string[] OutputString_SentenceLevel = new string[6];
                        string Classification = GetClassification((double)RNNCoreAnnotations.getPredictedClass(tree));
                        OutputString_SentenceLevel[0] = "\"" + Filename_Clean + "\"";
                        OutputString_SentenceLevel[1] = SentenceCount.ToString();
                        OutputString_SentenceLevel[2] = "\"" + sentence.ToString().Replace("\"", "\"\"") + "\"";
                        OutputString_SentenceLevel[3] = Classification;
                        OutputString_SentenceLevel[4] = RNNCoreAnnotations.getPredictedClassProb(tree.label()).ToString();
                        OutputString_SentenceLevel[5] = RNNCoreAnnotations.getPredictedClass(tree).ToString();
                        outputFileSentences.WriteLine(String.Join(",", OutputString_SentenceLevel));
                    }

                    //write output at the file level
                    string[] OutputString = new string[5];
                    OutputString[0] = "\"" + Filename_Clean + "\"";
                    OutputString[1] = SentenceCount.ToString();
                    OutputString[2] = GetClassification(SentimentValues.Average());
                    OutputString[3] = SentimentValues.Average().ToString();
                    OutputString[4] = StandardDeviation(SentimentValues).ToString();
                    outputFile.WriteLine(String.Join(",", OutputString));
                }
            } //this is the closing bracket for the sentence-level "using" filestream
        } //this is the closing bracket for the document-level "using" filestream
    }
    catch
    {
        MessageBox.Show("Senti-Gent encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while Senti-Gent is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
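//The Senti-Gent handler above leans on three helpers that are not shown in this file:
//AddSuffix, GetClassification, and StandardDeviation. The sketches below are assumptions
//about their behavior (CoreNLP's sentiment classes run 0 = very negative to 4 = very
//positive), not the tool's actual implementations.
private static string AddSuffix(string path, string suffix)
{
    //"results.csv" + "_Sentences" -> "results_Sentences.csv"
    return Path.Combine(Path.GetDirectoryName(path),
                        Path.GetFileNameWithoutExtension(path) + suffix + Path.GetExtension(path));
}

private static string GetClassification(double predictedClass)
{
    //map the 0..4 CoreNLP sentiment scale onto coarse labels
    if (predictedClass < 1.5) return "Negative";
    if (predictedClass <= 2.5) return "Neutral";
    return "Positive";
}

private static double StandardDeviation(List<double> values)
{
    //population standard deviation of the per-sentence class numbers
    if (values.Count == 0) return 0.0;
    double mean = values.Average();
    return Math.Sqrt(values.Sum(v => (v - mean) * (v - mean)) / values.Count);
}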
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
{
    DictionaryData DictData = (DictionaryData)e.Argument;
    SentimentIntensityAnalyzer VADER = new SentimentIntensityAnalyzer();

    //set up our sentence boundary detection
    Regex SentenceSplitter = new Regex(@"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", RegexOptions.Compiled);

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);

    try
    {
        //open up the document-level output file
        using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
        {
            //open up the sentence-level output file
            using (StreamWriter outputFileSentences = new StreamWriter(new FileStream(AddSuffix(DictData.OutputFileLocation, "_Sentences"), FileMode.Create), SelectedEncoding))
            {
                //write the header rows to the output files
                StringBuilder HeaderString = new StringBuilder();
                HeaderString.Append("\"Filename\",\"WC\",\"Sentences\",\"Classification\",\"Compound_M\",\"Positive_M\",\"Negative_M\",\"Neutral_M\"");
                outputFile.WriteLine(HeaderString.ToString());

                StringBuilder HeaderStringSentence = new StringBuilder();
                HeaderStringSentence.Append("\"Filename\",\"WC\",\"Sentence\",\"Classification\",\"Compound_M\",\"Positive_M\",\"Negative_M\",\"Neutral_M\"");
                outputFileSentences.WriteLine(HeaderStringSentence.ToString());

                foreach (string fileName in files)
                {
                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);

                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker)delegate
                    {
                        FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    });

                    //read in the text file, trim surrounding whitespace, and split it into
                    //sentences and words
                    string InputText = File.ReadAllText(fileName, SelectedEncoding).Trim();
                    string[] Sentences = SentenceSplitter.Split(InputText).Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
                    int TotalStringLength = InputText.Split().Where(x => !string.IsNullOrWhiteSpace(x)).ToArray().Length;
                    int TotalSentences = Sentences.Length;

                    // ======================= Analyze Text =======================

                    int[] Sentence_WC = new int[Sentences.Length];
                    VaderSharp.SentimentAnalysisResults[] results = new VaderSharp.SentimentAnalysisResults[Sentences.Length];
                    for (int i = 0; i < Sentences.Length; i++)
                    {
                        results[i] = VADER.PolarityScores(Sentences[i]);
                        Sentence_WC[i] = Sentences[i].Split().Where(x => !string.IsNullOrWhiteSpace(x)).ToArray().Length;
                    }

                    // ======================= Write Output =======================

                    string[] OutputString = new string[8];
                    OutputString[0] = "\"" + Filename_Clean + "\"";
                    OutputString[1] = "0";
                    OutputString[2] = TotalSentences.ToString();
                    OutputString[3] = "";

                    int TotalWC = 0;
                    if (TotalStringLength > 0)
                    {
                        Dictionary<string, double> Average_Results = new Dictionary<string, double>();
                        Average_Results.Add("Positive", 0.0);
                        Average_Results.Add("Neutral", 0.0);
                        Average_Results.Add("Negative", 0.0);
                        Average_Results.Add("Compound", 0.0);

                        for (int i = 0; i < TotalSentences; i++)
                        {
                            TotalWC += Sentence_WC[i];
                            Average_Results["Positive"] += results[i].Positive;
                            Average_Results["Neutral"] += results[i].Neutral;
                            Average_Results["Negative"] += results[i].Negative;
                            Average_Results["Compound"] += results[i].Compound;

                            //write the sentence-level output
                            string[] OutputString_Sentence_Level = new string[8];
                            OutputString_Sentence_Level[0] = "\"" + Filename_Clean + "\"";
                            OutputString_Sentence_Level[1] = Sentence_WC[i].ToString();
                            OutputString_Sentence_Level[2] = "\"" + Sentences[i].Replace("\"", "\"\"") + "\"";

                            //the conventional VADER cutoffs: compound > 0.05 is positive,
                            //compound < -0.05 is negative, anything in between is neutral
                            if (results[i].Compound > 0.05)
                            {
                                OutputString_Sentence_Level[3] = "pos";
                            }
                            else if (results[i].Compound > -0.05)
                            {
                                OutputString_Sentence_Level[3] = "neut";
                            }
                            else
                            {
                                OutputString_Sentence_Level[3] = "neg";
                            }
                            OutputString_Sentence_Level[4] = results[i].Compound.ToString();
                            OutputString_Sentence_Level[5] = results[i].Positive.ToString();
                            OutputString_Sentence_Level[6] = results[i].Negative.ToString();
                            OutputString_Sentence_Level[7] = results[i].Neutral.ToString();
                            outputFileSentences.WriteLine(String.Join(",", OutputString_Sentence_Level));
                        }

                        Average_Results["Positive"] = Average_Results["Positive"] / (double)TotalSentences;
                        Average_Results["Neutral"] = Average_Results["Neutral"] / (double)TotalSentences;
                        Average_Results["Negative"] = Average_Results["Negative"] / (double)TotalSentences;
                        Average_Results["Compound"] = Average_Results["Compound"] / (double)TotalSentences;

                        OutputString[1] = TotalWC.ToString();
                        OutputString[4] = Average_Results["Compound"].ToString();
                        OutputString[5] = Average_Results["Positive"].ToString();
                        OutputString[6] = Average_Results["Negative"].ToString();
                        OutputString[7] = Average_Results["Neutral"].ToString();

                        if (Average_Results["Compound"] > 0.05)
                        {
                            OutputString[3] = "pos";
                        }
                        else if (Average_Results["Compound"] > -0.05)
                        {
                            OutputString[3] = "neut";
                        }
                        else
                        {
                            OutputString[3] = "neg";
                        }
                    }
                    else
                    {
                        //empty file: blank out the sentence count and all of the score columns
                        OutputString[2] = "";
                        for (int i = 3; i < 8; i++)
                        {
                            OutputString[i] = "";
                        }
                    }
                    outputFile.WriteLine(String.Join(",", OutputString));
                }
            }
        }
    }
    catch
    {
        MessageBox.Show("VADER-Tots encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while VADER-Tots is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
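//For reference, the VaderSharp calls used above, isolated into a minimal example; this
//mirrors the handler's own usage rather than documenting the library exhaustively (the
//sample sentence is illustrative only).
private static void VaderExample()
{
    SentimentIntensityAnalyzer analyzer = new SentimentIntensityAnalyzer();
    SentimentAnalysisResults scores = analyzer.PolarityScores("What a wonderful day!");
    //scores.Compound falls in [-1, 1]; Positive/Negative/Neutral are proportions
    Console.WriteLine(scores.Compound);
}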
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
{
    DictionaryData DictData = (DictionaryData)e.Argument;

    //selects the input and output text encodings based on user selection
    Encoding InputSelectedEncoding = null;
    Encoding OutputSelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        InputSelectedEncoding = Encoding.GetEncoding(InputEncodingDropdown.SelectedItem.ToString());
        OutputSelectedEncoding = Encoding.GetEncoding(OutputEncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(DictData.TextFileFolder, DictData.FileExtension, SearchDepth);

    try
    {
        //we want to be conservative and limit the number of threads to the number of processors that we have
        var options = new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount };

        Parallel.ForEach(files, options, (string fileName) =>
        {
            //set up our variables to report
            string Filename_Clean = Path.GetFileName(fileName);
            string SubDirStructure = Path.GetDirectoryName(fileName).Replace(DictData.TextFileFolder, "").TrimStart('\\');

            //creates subdirs if they don't exist
            string Output_Location = DictData.OutputFileLocation + '\\' + SubDirStructure;
            if (!Directory.Exists(Output_Location))
            {
                Directory.CreateDirectory(Output_Location);
            }
            Output_Location = Path.Combine(Output_Location, Path.GetFileName(fileName));

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate
            {
                FilenameLabel.Text = "Processing: " + Filename_Clean;
                FilenameLabel.Invalidate();
                FilenameLabel.Update();
                FilenameLabel.Refresh();
                Application.DoEvents();
            });

            // ======================= Write Output =======================

            using (StreamReader inputfile = new StreamReader(fileName, InputSelectedEncoding))
            {
                string readText = inputfile.ReadToEnd();
                if (DictData.FixNULtermination)
                {
                    readText = string.Join("", readText.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries));
                }

                //open up the output file
                using (StreamWriter outputFile = new StreamWriter(new FileStream(Output_Location, FileMode.Create), OutputSelectedEncoding))
                {
                    outputFile.Write(readText);
                }
            }
        });
    }
    catch
    {
        MessageBox.Show("TranscodeTXT encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while transcoding", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
{
    DictionaryData DictData = (DictionaryData)e.Argument;

    //selects the text encoding based on user selection
    Encoding OutputSelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        OutputSelectedEncoding = Encoding.GetEncoding(OutputEncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(DictData.TextFileFolder, DictData.FileExtension, SearchDepth);

    try
    {
        using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), OutputSelectedEncoding))
        {
            //write the header row to the output file
            outputFile.WriteLine("\"Filename\",\"Created\",\"FileSizeKB\",\"Encoding\"");

            foreach (string fileName in files)
            {
                //set up our variables to report
                string Filename_Clean = Path.GetFileName(fileName);

                //report what we're working on
                FilenameLabel.Invoke((MethodInvoker)delegate
                {
                    FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    FilenameLabel.Invalidate();
                    FilenameLabel.Update();
                    FilenameLabel.Refresh();
                    Application.DoEvents();
                });

                string[] OutputString = new string[4];
                FileInfo oFileInfo = new FileInfo(fileName);

                //detect the file's encoding, falling back to a placeholder when detection fails
                string FileEncodingDetected = ExamineTXT.SimpleHelpers.FileEncoding.DetectFileEncoding(fileName);
                string DetectedEncodingString = "[UNKNOWN]";
                if (FileEncodingDetected != null)
                {
                    DetectedEncodingString = FileEncodingDetected;
                }

                OutputString[0] = fileName;
                OutputString[1] = oFileInfo.CreationTime.ToString();
                OutputString[2] = (oFileInfo.Length / 1024.0).ToString("#.##");
                OutputString[3] = DetectedEncodingString;
                outputFile.WriteLine("\"" + string.Join("\",\"", OutputString) + "\"");
            }
        }
    }
    catch
    {
        MessageBox.Show("ExamineTXT encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
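//Note: DetectFileEncoding above comes from the SimpleHelpers.FileEncoding source package
//(compiled into the ExamineTXT namespace here); it samples the file's bytes and returns
//the detected encoding's name as a string, or null when detection fails -- hence the
//"[UNKNOWN]" fallback.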
private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
{
    //splits the raw text into lines
    Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    bool SpeakerMultipleLines = false;
    bool UsingRegex = false;
    string RegExString = "";
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
        SpeakerMultipleLines = SpeakersMultipleLinesCheckbox.Checked;
        RegExString = RegexTextBox.Text;
    });

    Regex CompiledRegex = new Regex(RegExString, RegexOptions.Compiled);
    if (!string.IsNullOrEmpty(RegExString))
    {
        UsingRegex = true;
    }

    //the very first thing that we want to do is set up our speaker list
    string[] SpeakerList = NewlineClean.Split(SpeakerListTextBox.Text);
    //if we want things to be case-insensitive, this is what we'd do:
    //string[] SpeakerList = NewlineClean.Split(SpeakerListTextBox.Text.ToLower());

    //remove blanks
    SpeakerList = SpeakerList.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
    int SpeakerListLength = SpeakerList.Length;

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(((string[])e.Argument)[0], "*.txt", SearchDepth);

    string outputFolder = System.IO.Path.Combine(((string[])e.Argument)[1], "ConverSplitter_Output");
    try
    {
        System.IO.Directory.CreateDirectory(outputFolder);
    }
    catch
    {
        MessageBox.Show("ConverSplitterPlus could not create your output folder.\r\nIs your output directory write protected?");
        e.Cancel = true;
        return;
    }

    try
    {
        foreach (string fileName in files)
        {
            string outputFolder_Subs = "";
            if (SearchDepth == SearchOption.AllDirectories)
            {
                string subfolder = Path.GetDirectoryName(fileName).Replace(((string[])e.Argument)[0], "");
                outputFolder_Subs = System.IO.Path.Combine(outputFolder, subfolder.Trim(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));
                try
                {
                    System.IO.Directory.CreateDirectory(outputFolder_Subs);
                }
                catch
                {
                    MessageBox.Show("ConverSplitterPlus could not create a subdirectory in your output folder.\r\nIs your output directory write protected?");
                    e.Cancel = true;
                    return;
                }
            }

            //set up our variables to report
            string Filename_Clean = Path.GetFileName(fileName);
            Dictionary<string, string> Text_Split = new Dictionary<string, string>();

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate
            {
                FilenameLabel.Text = "Analyzing: " + Filename_Clean;
            });

            string[] readText_Lines = NewlineClean.Split(File.ReadAllText(fileName, SelectedEncoding));
            int NumberOfLines = readText_Lines.Length;

            //loop through all of the lines in each text
            string PreviousSpeaker = "";
            for (int i = 0; i < NumberOfLines; i++)
            {
                string CurrentLine = readText_Lines[i];
                if (UsingRegex)
                {
                    CurrentLine = CompiledRegex.Replace(CurrentLine, "").Trim();
                }
                else
                {
                    CurrentLine = CurrentLine.Trim();
                }

                //if the line is empty, move along... move along
                if (CurrentLine.Length == 0)
                {
                    continue;
                }

                bool FoundSpeaker = false;

                //loop through each speaker in the list to see if the line starts with their name
                for (int j = 0; j < SpeakerListLength; j++)
                {
                    //here's what we do if we find a match
                    if (CurrentLine.StartsWith(SpeakerList[j]))
                    {
                        FoundSpeaker = true;
                        PreviousSpeaker = SpeakerList[j];

                        //clean up the line to remove the speaker tag from the beginning
                        int Place = CurrentLine.IndexOf(SpeakerList[j]);
                        CurrentLine = CurrentLine.Remove(Place, SpeakerList[j].Length).Trim() + "\r\n";

                        if (Text_Split.ContainsKey(SpeakerList[j]))
                        {
                            Text_Split[SpeakerList[j]] += CurrentLine;
                        }
                        else
                        {
                            Text_Split.Add(SpeakerList[j], CurrentLine);
                        }

                        //break to the next line in the text
                        break;
                    }
                }

                //what we will do if no speaker was found: if the user wants speaker turns
                //to span multiple lines, attribute the line to the most recent speaker
                if ((FoundSpeaker == false) && (PreviousSpeaker != ""))
                {
                    if (SpeakerMultipleLines)
                    {
                        Text_Split[PreviousSpeaker] += CurrentLine.Trim() + "\r\n";
                    }
                }
            } //end of for loop through each line

            //here's where we want to write the output! hooray!
            foreach (KeyValuePair<string, string> entry in Text_Split)
            {
                string OutputFilename = Path.GetFileNameWithoutExtension(fileName) + ";" + entry.Key + ".txt";

                //clean up broken filenames
                foreach (var c in Path.GetInvalidFileNameChars())
                {
                    OutputFilename = OutputFilename.Replace(c, '_');
                }

                //set the full path of our output
                if (SearchDepth == SearchOption.AllDirectories)
                {
                    OutputFilename = System.IO.Path.Combine(outputFolder_Subs, OutputFilename);
                }
                else
                {
                    OutputFilename = System.IO.Path.Combine(outputFolder, OutputFilename);
                }

                //write the output
                using (StreamWriter outputFile = new StreamWriter(new FileStream(OutputFilename, FileMode.Create, FileAccess.Write), SelectedEncoding))
                {
                    outputFile.Write(entry.Value);
                }
            }
        } //end of for loop through each file
    } //end of try block
    catch
    {
        MessageBox.Show("ConverSplitterPlus could not open your output file\r\nfor writing. Is the file open in another application?");
    }
}
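//A worked example of the splitter above (illustrative filenames and speakers, not from
//the source): with the speaker list "ALICE:" and "BOB:", an input file interview.txt
//containing
//    ALICE: How are you?
//    BOB: Fine, thanks.
//    Still a bit tired, though.
//produces "interview;ALICE_.txt" holding "How are you?" and "interview;BOB_.txt" holding
//"Fine, thanks." -- and, with the multiple-lines checkbox ticked, the untagged third
//line is appended to BOB's file as well. (The ':' in each speaker tag is an invalid
//filename character, so it gets swapped for '_'.)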
private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
{
    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //report what we're working on
    FilenameLabel.Invoke((MethodInvoker)delegate
    {
        FilenameLabel.Text = "Loading model... please wait...";
    });

    //path to the folder with the segmenter models
    var segmenterData = Path.Combine(Path.GetDirectoryName(AppDomain.CurrentDomain.BaseDirectory), @"data");
    var props = new Properties();
    props.setProperty("sighanCorporaDict", segmenterData);
    props.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
    //lines below are needed because CTBSegDocumentIteratorFactory accesses the encoding;
    //WebName yields the IANA-style name (e.g., "utf-8") that the Java side expects,
    //whereas ToString() would only return the .NET type name
    props.setProperty("inputEncoding", SelectedEncoding.WebName);
    props.setProperty("sighanPostProcessing", "true");

    var segmenter = new CRFClassifier(props);
    segmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", props);

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(((string[])e.Argument)[0], "*.txt", SearchDepth);

    try
    {
        string outputdir = Path.Combine(((string[])e.Argument)[1]);
        Directory.CreateDirectory(outputdir);

        foreach (string fileName in files)
        {
            //set up our variables to report
            string Filename_Clean = Path.GetFileName(fileName);

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate
            {
                FilenameLabel.Text = "Analyzing: " + Filename_Clean;
            });

            //read the file, lowercase it, and run the segmenter over it
            string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();
            string TokenResults = segmenter.classifyToString(readText);

            using (System.IO.StreamWriter fileout = new StreamWriter(Path.Combine(outputdir, Filename_Clean), false, SelectedEncoding))
            {
                fileout.Write(TokenResults);
            }
        }
    }
    catch
    {
        MessageBox.Show("ZhToken encountered a problem while trying to tokenize/write a file.");
    }
}
private void DetectSpeakersBGWorker_DoWork(object sender, DoWorkEventArgs e)
{
    //splits the raw text into lines
    Regex NewlineClean = new Regex(@"[\r\n]+", RegexOptions.Compiled);

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    bool UsingRegex = false;
    string RegExString = "";
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
        RegExString = RegexTextBox.Text;
    });

    Regex CompiledRegex = new Regex(RegExString, RegexOptions.Compiled);
    if (!string.IsNullOrEmpty(RegExString))
    {
        UsingRegex = true;
    }

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(((string[])e.Argument)[0], "*.txt", SearchDepth);

    //pull out the arguments and put them into more accessible variable names,
    //making sure that we convert our max tag length to an integer
    int MaxTagLength = int.Parse(((string[])e.Argument)[1]);
    string DelimiterString = ((string[])e.Argument)[2];
    int DelimiterLength = DelimiterString.Length;

    HashSet<string> SpeakerList = new HashSet<string>();

    try
    {
        foreach (string fileName in files)
        {
            //set up our variables to report
            string Filename_Clean = Path.GetFileName(fileName);

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate
            {
                FilenameLabel.Text = "Analyzing: " + Filename_Clean;
            });

            string[] readText_Lines = NewlineClean.Split(File.ReadAllText(fileName, SelectedEncoding));
            int NumberOfLines = readText_Lines.Length;

            //loop through all of the lines in each text
            for (int i = 0; i < NumberOfLines; i++)
            {
                string CurrentLine = readText_Lines[i];
                if (UsingRegex)
                {
                    CurrentLine = CompiledRegex.Replace(CurrentLine, "").Trim();
                }
                else
                {
                    CurrentLine = CurrentLine.Trim();
                }

                //everything up to (and including) the delimiter counts as the speaker tag
                int IndexOfDelimiter = CurrentLine.IndexOf(DelimiterString);
                if (IndexOfDelimiter > -1)
                {
                    string SpeakerTag = CurrentLine.Substring(0, IndexOfDelimiter + DelimiterLength);
                    if (SpeakerTag.Length <= MaxTagLength)
                    {
                        //HashSet.Add is a no-op for tags that we have already seen
                        SpeakerList.Add(SpeakerTag);
                    }
                }
            } //end of for loop through each line
        } //end of for loop through each file
    } //end of try block
    catch
    {
        MessageBox.Show("ConverSplitterPlus encountered an issue while opening / scanning your files.\r\nAre your text files open in another program?");
    }

    e.Result = SpeakerList;
}
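//The detected tags come back through e.Result; a plausible completion handler (assumed
//here for illustration -- the real one is not part of this excerpt) would surface them
//in the speaker-list textbox for the user to review:
private void DetectSpeakersBGWorker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
{
    if (!e.Cancelled && e.Result is HashSet<string> speakers)
    {
        SpeakerListTextBox.Text = string.Join(Environment.NewLine, speakers);
    }
}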
private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
{
    BGWorkerData BGData = (BGWorkerData)e.Argument;
    BGData.NumberOfMatches = new uint[BGData.RegexArray.Length];
    BGData.TotalFilesMatched = new uint[BGData.RegexArray.Length];
    for (int i = 0; i < BGData.NumberOfMatches.Length; i++)
    {
        BGData.NumberOfMatches[i] = 0;
        BGData.TotalFilesMatched[i] = 0;
    }

    //the match-count arrays are shared across the parallel loop below, so we
    //guard the increments with a lock to keep the tallies accurate
    object CounterLock = new object();

    //selects the text encoding based on user selection
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    //get the list of files
    var SearchDepth = SearchOption.TopDirectoryOnly;
    if (ScanSubfolderCheckbox.Checked)
    {
        SearchDepth = SearchOption.AllDirectories;
    }
    var files = Directory.EnumerateFiles(BGData.TextFileFolder, BGData.Filetype, SearchDepth);

    try
    {
        //we want to be conservative and limit the number of threads to the number of processors that we have
        var options = new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount };

        Parallel.ForEach(files, options, (string fileName) =>
        {
            //set up our variables to report
            string Filename_Clean = Path.GetFileName(fileName);
            string SubDirStructure = Path.GetDirectoryName(fileName).Replace(BGData.TextFileFolder, "").TrimStart('\\');

            //creates subdirs if they don't exist
            string Output_Location = BGData.OutputFileLocation + '\\' + SubDirStructure;
            if (!Directory.Exists(Output_Location))
            {
                Directory.CreateDirectory(Output_Location);
            }
            Output_Location = Path.Combine(Output_Location, Path.GetFileName(fileName));

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker)delegate
            {
                FilenameLabel.Text = "Processing: " + Filename_Clean;
                FilenameLabel.Invalidate();
                FilenameLabel.Update();
                FilenameLabel.Refresh();
                Application.DoEvents();
            });

            //read in the text file
            string readText = File.ReadAllText(fileName, SelectedEncoding);
            if (BGData.CompactWhitespace)
            {
                readText = Regex.Replace(readText, @"\s+", " ");
            }

            // ======================= Analyze Text =======================

            for (int i = 0; i < BGData.RegexArray.Length; i++)
            {
                int NumMatches = BGData.RegexArray[i].Matches(readText).Count;
                if (NumMatches == 0)
                {
                    continue;
                }
                lock (CounterLock)
                {
                    BGData.NumberOfMatches[i] += (uint)NumMatches;
                    BGData.TotalFilesMatched[i] += 1;
                }
                readText = BGData.RegexArray[i].Replace(readText, BGData.ReplacementArray[i]);
            }

            // ======================= Write Output =======================

            //open up the output file
            using (StreamWriter outputFile = new StreamWriter(new FileStream(Output_Location, FileMode.Create), SelectedEncoding))
            {
                outputFile.Write(readText);
            }
        });

        //write the summary report, escaping any quotes in the patterns and replacements
        using (StreamWriter outputFile = new StreamWriter(new FileStream(Path.Combine(BGData.OutputFileLocation, "__TextEmend-Report.csv"), FileMode.Create), SelectedEncoding))
        {
            outputFile.WriteLine("\"RegEx\",\"Replacement\",\"NumberOfMatches\",\"FilesWithPattern\"");
            for (int i = 0; i < BGData.RegexArray.Length; i++)
            {
                outputFile.WriteLine("\"" + BGData.RegexArray[i].ToString().Replace("\"", "\"\"") + "\"," +
                                     "\"" + BGData.ReplacementArray[i].Replace("\"", "\"\"") + "\"," +
                                     BGData.NumberOfMatches[i].ToString() + "," +
                                     BGData.TotalFilesMatched[i].ToString());
            }
        }
    }
    catch
    {
        MessageBox.Show("TextEmend encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file(s) while the program is still running. Did any of your input files move, or is your output file being opened/modified by another application? Are you sure that your regular expressions are properly formed?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
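//A minimal sketch (an assumption, inferred from usage) of the BGWorkerData argument
//that the TextEmend handler above unpacks:
public class BGWorkerData
{
    public string TextFileFolder;      //input root folder
    public string OutputFileLocation;  //output root folder (subfolder structure is mirrored)
    public string Filetype;            //search pattern, e.g. "*.txt"
    public bool CompactWhitespace;     //collapse runs of whitespace before matching
    public Regex[] RegexArray;         //precompiled user-supplied patterns
    public string[] ReplacementArray;  //one replacement string per pattern
    public uint[] NumberOfMatches;     //total matches per pattern, filled in by the worker
    public uint[] TotalFilesMatched;   //number of files containing each pattern, filled in by the worker
}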