/// <summary>
/// Ribbon handler: collects every distinct word of the active document and writes
/// them, sorted, into a newly created three-column document.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Ribbon event data (not used).</param>
private void btn_WordList_Click(object sender, RibbonControlEventArgs e)
{
    // Use the shared collector instead of repeating its loop here. It reads the
    // active document, strips punctuation, splits into words and removes
    // all-number entries. It must run before Documents.Add() below, because the
    // new document may become the active one.
    HashSet<string> wordlist = GetWordList();

    string[] words = wordlist.ToArray();
    Array.Sort(words);

    //Create new document
    Word.Document newdoc = Globals.ThisAddIn.Application.Documents.Add();
    Word.Paragraph pgraph;

    //Intro text
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.set_Style(newdoc.Styles["Heading 1"]);
    pgraph.Range.Text = "Word List\n";
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.set_Style(newdoc.Styles["Normal"]);
    pgraph.Range.Text = "This is a proofreading tool. It takes every word in the document, strips the punctuation, removes words that consist only of numbers, and then presents them all in alphabetical order. This is a great way to find typos and inconsistencies.\n";
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.Range.Text = "Capitalization is retained as is. That means that words that appear at the beginning of a sentence will appear capitalized.\n";

    //Start a second section so only the word list itself is laid out in columns
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.Range.InsertBreak(Word.WdBreakType.wdSectionBreakContinuous);
    Word.Section sec = newdoc.Sections[2];
    sec.PageSetup.TextColumns.SetCount(3);

    //One word per line, written in a single assignment
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.Range.Text = string.Join("\n", words) + "\n";

    //Close the column section
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.Range.InsertBreak(Word.WdBreakType.wdSectionBreakContinuous);

    // NOTE(review): presumably set so Word does not grammar-check the generated list - confirm.
    newdoc.GrammarChecked = true;
}
/// <summary>
/// Counts how often each token occurs in the text of the active document.
/// Text is passed through TextHelpers.StripPunctuation and split on whitespace runs;
/// blank tokens and tokens consisting solely of digits are not counted.
/// </summary>
/// <returns>Map from token (exactly as produced by the split) to its occurrence count.</returns>
static public Dictionary<string, uint> GetWordFrequencyList()
{
    Word.Document activeDoc = Globals.ThisAddIn.Application.ActiveDocument;
    Dictionary<string, uint> counts = new Dictionary<string, uint>();
    Regex digitsOnly = new Regex(@"^\d+$");

    foreach (Word.Range textRange in TextHelpers.GetText(activeDoc))
    {
        string cleaned = TextHelpers.StripPunctuation(textRange.Text);

        foreach (string token in Regex.Split(cleaned, @"\s+"))
        {
            // Skip pure numbers and the empty pieces Regex.Split yields at the edges.
            if (digitsOnly.IsMatch(token) || token.Trim() == "")
            {
                continue;
            }

            // A missing key leaves 'seen' at 0, so this covers both first and repeat sightings.
            uint seen;
            counts.TryGetValue(token, out seen);
            counts[token] = seen + 1;
        }
    }

    return counts;
}
/// <summary>
/// Builds the set of distinct words found in the active document.
/// Each text range is stripped of punctuation and split via TextHelpers.ToWords;
/// the combined set is then filtered through TextHelpers.RemoveNumbers.
/// </summary>
/// <returns>The set returned by TextHelpers.RemoveNumbers for the collected words.</returns>
static public HashSet<string> GetWordList()
{
    Word.Document activeDoc = Globals.ThisAddIn.Application.ActiveDocument;
    HashSet<string> collected = new HashSet<string>();

    // Lazily project each range to its punctuation-free text.
    IEnumerable<string> cleanedTexts = TextHelpers.GetText(activeDoc)
        .Select(range => TextHelpers.StripPunctuation(range.Text));

    foreach (string cleaned in cleanedTexts)
    {
        collected.UnionWith(TextHelpers.ToWords(cleaned));
    }

    //strip words that are all numbers
    return TextHelpers.RemoveNumbers(collected);
}
/// <summary>
/// Ribbon handler: counts every run of consecutive words ("phrase") whose length lies
/// between the two ribbon edit fields, sentence by sentence, and lists the phrases that
/// occur more than once in a newly created two-column document, most frequent first.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Ribbon event data (not used).</param>
private void btn_PhraseFrequency_Click(object sender, RibbonControlEventArgs e)
{
    Word.Document doc = Globals.ThisAddIn.Application.ActiveDocument;

    // TryParse leaves 0 in the out variable on failure; the check below rejects 0,
    // so the boolean results do not need to be inspected separately.
    uint newminlen;
    uint newmaxlen;
    UInt32.TryParse(edit_MinPhraseLen.Text, out newminlen);
    UInt32.TryParse(edit_MaxPhraseLen.Text, out newmaxlen);

    if ((newminlen == 0) || (newmaxlen == 0) || (newminlen > newmaxlen))
    {
        MessageBox.Show("The phrase length fields must contain numbers greater than zero, and the minimum length must be less than or equal to the maximum length.");
        return;
    }

    //Remember the accepted lengths
    Properties.Settings.Default.minphraselen = newminlen;
    Properties.Settings.Default.maxphraselen = newmaxlen;
    Properties.Settings.Default.Save();

    Dictionary<string, uint> phrases = new Dictionary<string, uint>();

    //Iterate through all text, one sentence at a time so phrases never span sentences
    foreach (Word.Range rng in TextHelpers.GetText(doc))
    {
        foreach (Word.Range sentence in rng.Sentences)
        {
            TallyPhrases(sentence.Text, newminlen, newmaxlen, phrases);
        }
    }

    //Display results
    //Create new document
    Word.Document newdoc = Globals.ThisAddIn.Application.Documents.Add();
    Word.Paragraph pgraph;

    //Intro text
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.set_Style(newdoc.Styles["Heading 1"]);
    pgraph.Range.Text = "Phrase Frequency List\n";
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.set_Style(newdoc.Styles["Normal"]);
    // NOTE(review): the literal had a raw line break after "removed. ". It is written
    // here as \v (vertical tab, Word's manual line break) on the assumption that such
    // a character was embedded in the source - confirm against the original file.
    pgraph.Range.Text = "Punctuation (other than apostrophes) has been removed. \vAll words have been lowercased for comparison.\n";

    //Start a second section so only the results are laid out in columns
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.Range.InsertBreak(Word.WdBreakType.wdSectionBreakContinuous);
    Word.Section sec = newdoc.Sections[2];
    sec.PageSetup.TextColumns.SetCount(2);

    //Only repeated phrases are interesting; most frequent first
    var phraselist = phrases.Where(x => x.Value > 1).ToList();
    phraselist.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value));
    foreach (var pair in phraselist)
    {
        pgraph = newdoc.Content.Paragraphs.Add();
        pgraph.set_Style(newdoc.Styles["Normal"]);
        pgraph.Range.Text = pair.Key + "\t" + pair.Value.ToString() + "\n";
    }

    //Close the column section
    pgraph = newdoc.Content.Paragraphs.Add();
    pgraph.Range.InsertBreak(Word.WdBreakType.wdSectionBreakContinuous);

    // NOTE(review): presumably set so Word does not grammar-check the generated list - confirm.
    newdoc.GrammarChecked = true;
}

/// <summary>
/// Adds every phrase of minLen..maxLen consecutive words in one sentence to the running counts.
/// </summary>
/// <param name="sentenceText">Raw text of a single sentence.</param>
/// <param name="minLen">Shortest phrase length, in words (at least 1).</param>
/// <param name="maxLen">Longest phrase length, in words (not less than minLen).</param>
/// <param name="counts">Accumulator keyed by the lowercased, space-joined phrase.</param>
private static void TallyPhrases(string sentenceText, uint minLen, uint maxLen, Dictionary<string, uint> counts)
{
    // Lowercase once per sentence rather than once per extracted phrase.
    string nopunc = TextHelpers.StripPunctuation(sentenceText).ToLower();

    // A null separator splits on any whitespace. Together with RemoveEmptyEntries this
    // keeps empty tokens (from doubled or trailing spaces) and paragraph marks glued to
    // the last word out of the phrases.
    string[] words = nopunc.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

    // No phrase can be longer than the sentence. Clamping also keeps the uint counter
    // from wrapping around when maxLen is UInt32.MaxValue.
    uint longest = Math.Min(maxLen, (uint)words.Length);

    for (uint len = minLen; len <= longest; len++)
    {
        int count = (int)len;

        // "<=" so that the phrase ending on the final word is included.
        for (int start = 0; start + count <= words.Length; start++)
        {
            string phrase = string.Join(" ", words, start, count);

            // A missing key leaves 'seen' at 0.
            uint seen;
            counts.TryGetValue(phrase, out seen);
            counts[phrase] = seen + 1;
        }
    }
}
/// <summary>
/// Ribbon handler: counts how often each word occurs in the active document and writes
/// the result, most frequent first, as a two-column table in a newly created document.
/// A progress dialog is shown while counting and while filling the table.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Ribbon event data (not used).</param>
private void btn_WordFreq_Click(object sender, RibbonControlEventArgs e)
{
    ProgressDialog d = new ProgressDialog();
    d.Show();
    try
    {
        Stopwatch watch = new Stopwatch();
        watch.Start();

        Word.Document doc = Globals.ThisAddIn.Application.ActiveDocument;
        Dictionary<string, uint> wordlist = new Dictionary<string, uint>();
        Regex re_allnums = new Regex(@"^\d+$");

        // Materialize once: the count is needed up front for the progress bar, and
        // calling Count() on the raw enumerable would enumerate it a second time.
        List<Word.Range> textranges = TextHelpers.GetText(doc).ToList();
        d.pbMax = textranges.Count;
        d.pbVal = 0;

        foreach (Word.Range rng in textranges)
        {
            d.pbVal++;

            //strip punctuation
            string txt = TextHelpers.StripPunctuation(rng.Text);

            foreach (string word in Regex.Split(txt, @"\s+"))
            {
                // Skip blank pieces from the split and tokens that are all digits.
                if (string.IsNullOrWhiteSpace(word) || re_allnums.IsMatch(word))
                {
                    continue;
                }

                // A missing key leaves 'seen' at 0.
                uint seen;
                wordlist.TryGetValue(word, out seen);
                wordlist[word] = seen + 1;
            }
        }
        Debug.WriteLine("Counts tabulated. Time elapsed: " + watch.Elapsed.ToString());
        watch.Restart();

        //Create new document
        Word.Document newdoc = Globals.ThisAddIn.Application.Documents.Add();
        Word.Paragraph pgraph;

        //Intro text
        pgraph = newdoc.Content.Paragraphs.Add();
        pgraph.set_Style(newdoc.Styles["Heading 1"]);
        pgraph.Range.Text = "Word Frequency List\n";
        pgraph = newdoc.Content.Paragraphs.Add();
        pgraph.set_Style(newdoc.Styles["Normal"]);
        // NOTE(review): the literal had a raw line break after "capitalized. ". It is
        // written here as \v (vertical tab, Word's manual line break) on the assumption
        // that such a character was embedded in the source - confirm against the original file.
        pgraph.Range.Text = "Capitalization is retained as is. That means that words that appear at the beginning of a sentence will appear capitalized. \vDon't forget that you can sort the table!\n";
        pgraph = newdoc.Content.Paragraphs.Add();
        pgraph.set_Style(newdoc.Styles["Normal"]);
        pgraph.Range.Text = "Total words found (case sensitive): " + wordlist.Count.ToString() + "\n";

        //Start a second section so only the table is laid out in columns
        pgraph = newdoc.Content.Paragraphs.Add();
        pgraph.Range.InsertBreak(Word.WdBreakType.wdSectionBreakContinuous);
        Word.Section sec = newdoc.Sections[2];
        sec.PageSetup.TextColumns.SetCount(3);

        //Most frequent first
        var words = wordlist.ToList();
        words.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value));

        // Nothing to tabulate for a document without words. A zero-row table request
        // is expected to be rejected by Word - TODO confirm.
        if (words.Count > 0)
        {
            // Keep the Table that Add returns instead of looking up Tables[1] for every
            // cell. This also does not depend on the new table being the first one in
            // the document.
            Word.Table table = newdoc.Tables.Add(pgraph.Range, words.Count, 2);
            table.AutoFitBehavior(Word.WdAutoFitBehavior.wdAutoFitContent);
            table.AllowAutoFit = true;

            d.pbMax = words.Count;
            d.pbVal = 0;

            int row = 1;
            foreach (var pair in words)
            {
                d.pbVal++;
                table.Cell(row, 1).Range.Text = pair.Key;
                table.Cell(row, 2).Range.Text = pair.Value.ToString();
                row++;
            }
        }

        //Close the column section
        pgraph = newdoc.Content.Paragraphs.Add();
        pgraph.Range.InsertBreak(Word.WdBreakType.wdSectionBreakContinuous);

        // NOTE(review): presumably set so Word does not grammar-check the generated list - confirm.
        newdoc.GrammarChecked = true;

        Debug.WriteLine("All done. Time elapsed: " + watch.Elapsed.ToString());
        watch.Stop();
    }
    finally
    {
        // Always take the progress dialog down, even when Word throws part-way through.
        d.Hide();
    }
}