/// <summary>
/// Refreshes the label auto-completion list for the word at the caret after a key is released.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Key event data (unused).</param>
void Editor_KeyUp(object sender, KeyEventArgs e)
{
    TextRange wordRange = WordBreaker.GetWordRange(Editor.CaretPosition);

    // Suggestions are only offered once at least two characters have been typed.
    if (wordRange.Text.Length < 2)
    {
        HideList();
        return;
    }

    var matches =
        (from key in VirtualMailBox.VirtualMailBox.Current.Labels.Keys
         where key.IndexOf(wordRange.Text, StringComparison.InvariantCultureIgnoreCase) >= 0
         select new LabelsContainer(key)).ToList();

    // NOTE(review): the list is treated as unchanged when only the COUNT matches the
    // previous result set; a different set of equal size is not refreshed - confirm intended.
    var shown = AutoCompletionListBox.ItemsSource as List<LabelsContainer>;
    if (shown != null && shown.Count == matches.Count)
    {
        return;
    }

    if (matches.Count == 0)
    {
        // Nothing to show, hide the list (if it was already shown).
        HideList();
        return;
    }

    // Anchor the list at the start of the word being typed.
    var anchor = wordRange.Start.GetCharacterRect(LogicalDirection.Forward);
    ShowList(matches, anchor);
}
/// <summary>
/// Handles the Find command: selects the next word in the rich text box whose text
/// equals the text of the find combo box.
/// </summary>
/// <param name="sender">The <c>RichEditorControl</c> that raised the command.</param>
/// <param name="e">Command event data (unused).</param>
private static void OnFind(object sender, ExecutedRoutedEventArgs e)
{
    var editor = (RichEditorControl)sender;
    string needle = (string)editor.FindComboBox.Text;

    // Start from the top of the document when nothing is selected,
    // otherwise continue just after the current selection.
    TextPointer cursor;
    if (editor.RichTextBox.Selection.IsEmpty)
    {
        cursor = editor.RichTextBox.Document.ContentStart;
    }
    else
    {
        cursor = editor.RichTextBox.Selection.End.GetNextInsertionPosition(LogicalDirection.Forward);
    }

    while (cursor != null && cursor.CompareTo(editor.RichTextBox.Document.ContentEnd) < 0)
    {
        TextRange candidate = WordBreaker.GetWordRange(cursor);
        if (candidate == null)
        {
            return;
        }

        if (candidate.Text == needle)
        {
            editor.RichTextBox.Selection.Select(candidate.Start, candidate.End);
            return;
        }

        cursor = candidate.End.GetNextInsertionPosition(LogicalDirection.Forward);
    }
}
/// <summary>
/// Turns the word at the caret into a recipient when it is a valid e-mail address;
/// otherwise (with validation enabled) recolours the word.
/// </summary>
void ProcessCurrentWord()
{
    TextRange wordRange = WordBreaker.GetWordRange(Editor.CaretPosition);
    string candidate = wordRange.Text.Trim();

    if (SourceAddress.IsValidEmail(candidate))
    {
        SuppressListForCurrentWord = false;

        AddRecipient(new SourceAddress(candidate));

        // Notify listeners of the new entry.
        RebuildRecipientsList();
        return;
    }

    // Not a valid address: mark the word using the themed brush when validation is on.
    if (ValidationEnabled)
    {
        wordRange.ApplyPropertyValue(TextBlock.ForegroundProperty, FindResource("TabAndLightButtonText"));
        SuppressListForCurrentWord = false;
    }
}
/// <summary>
/// Builds a <c>Document</c> from raw page text: the text is split into sentences, and each
/// sentence is word-broken, stripped of stop words and stemmed before becoming a <c>Statement</c>.
/// </summary>
/// <param name="pageContents">Raw text of the page.</param>
/// <returns>A document containing one statement per sentence.</returns>
private static Document ConstructDocument(string pageContents)
{
    var stopWords = new StopWordRemover();
    var stemmer = new SStemmer();
    var wordBreaker = new WordBreaker();
    SentenceBreaker sentenceBreaker = SentenceBreaker.Instance;

    string[] sentences = sentenceBreaker.BreakIntoSentences(pageContents);
    var statements = new Statement[sentences.Length];

    for (int s = 0; s < sentences.Length; s++)
    {
        // break -> remove stop words -> stem, in that order
        string[] tokens = stemmer.StemWords(
            stopWords.RemoveStopWords(
                wordBreaker.BreakParagraph(sentences[s])));

        var words = new Word[tokens.Length];
        for (int w = 0; w < tokens.Length; w++)
        {
            words[w] = new Word(tokens[w]);
        }

        statements[s] = new Statement(words);
    }

    return new Document(statements);
}
/// <summary>
/// Inserts a visual element for <paramref name="source"/> at the caret, optionally adds the
/// label to the current message, and removes the typed word the element replaces.
/// </summary>
/// <param name="source">Label container to display.</param>
/// <param name="addToMessage">When true and a message is set, the label is also added to it.</param>
public void AddVisualLabel(LabelsContainer source, bool addToMessage)
{
    // Create the display object and host it inline at the caret.
    var presenter = new ContentControl();
    presenter.Content = source;
    contentEnd = new InlineUIContainer(presenter, Editor.CaretPosition).ContentEnd;

    if (addToMessage)
    {
        if (Message != null)
        {
            Message.AddLabel(new Label(source.Labelname));
        }
    }

    // Remove any typed text.
    TextRange typedWord = WordBreaker.GetWordRange(Editor.CaretPosition);
    typedWord.Text = String.Empty;

    // Move the caret to the end of the inserted element.
    Editor.CaretPosition = contentEnd;
    Editor.ShowWatermark = false;

    // Hide the dropdown list if it is visible.
    HideList();
}
/// <summary>
/// On a right-button press, resolves the word under the mouse pointer.
/// The resolved range is not used yet (see TODOs).
/// </summary>
private void richTextBox_MouseDown(object sender, MouseEventArgs args)
{
    //TODO: respond to text select
    if (args.RightButton != MouseButtonState.Pressed)
    {
        return;
    }

    var pointerInEditor = Mouse.GetPosition(rtb_Editor);
    TextPointer location = rtb_Editor.GetPositionFromPoint(pointerInEditor, true);
    TextRange wordRange = WordBreaker.GetWordRange(location);
    //TODO: what to do with this text maybe add an option to highlight it in context menu?
}
/// <summary>
/// Key-up handler for the editor. F4 requests suggestions for the current word; any other
/// key refreshes the cursor-position status and the "current word" around the caret.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Key event data; only <c>KeyCode</c> is inspected.</param>
private void RichTextBoxEditor_KeyUp(object sender, KeyEventArgs e)
{
    if (e.KeyCode == Keys.F4)
    {
        string word = toolStripStatusCurrentWord.Text;
        if (!bangla.Any())
        {
            // Word list not loaded yet: read it from the repository folder.
            WordBreaker breaker = new WordBreaker(option.Location.DBFolder);
            bangla = breaker.ReadWordFromRepository();
        }
        Suggest(word);
        return;
    }

    int caret = richTextBoxEditor.SelectionStart;
    int posInLine = caret - richTextBoxEditor.GetFirstCharIndexOfCurrentLine();
    toolStripStatusCursorPosition.Text = "" + posInLine;

    // Read the text once: the Text getter was previously invoked on every loop iteration.
    string text = richTextBoxEditor.Text;

    // Guard against a caret index beyond the retrieved text before indexing into it.
    int start = Math.Min(caret, text.Length);

    // Walk left to the nearest whitespace (inclusive) or the start of the text.
    int previousSpace = 0;
    for (int i = start - 1; i >= 0; i--)
    {
        previousSpace = i;
        if (char.IsWhiteSpace(text[i]))
        {
            break;
        }
    }

    // Walk right to just past the nearest whitespace or the end of the text.
    int nextSpace = start;
    for (int i = start; i < text.Length; i++)
    {
        nextSpace = i + 1;
        if (char.IsWhiteSpace(text[i]))
        {
            break;
        }
    }

    string guessWord;
    if (previousSpace >= nextSpace)
    {
        guessWord = "";
        WriteLine("Space here.");
    }
    else
    {
        // The slice may include the delimiting whitespace on either side; Trim removes it.
        guessWord = text.Substring(previousSpace, nextSpace - previousSpace).Trim();
    }

    toolStripStatusCurrentWord.Text = guessWord;
    textBoxTheWord.Text = guessWord;
}
/// <summary>
/// Converts the (trimmed) word at the caret into a visual label and adds it to the message.
/// Does nothing when the word is empty.
/// </summary>
void ProcessCurrentWord()
{
    string typed = WordBreaker.GetWordRange(Editor.CaretPosition).Text.Trim();

    if (typed.Length == 0)
    {
        return;
    }

    AddVisualLabel(new LabelsContainer(typed), true);
}
/// <summary>
/// Adds the word at the caret as a visual label, but only when a label with exactly
/// that (trimmed) key already exists in the current virtual mailbox.
/// </summary>
void CheckExistingLabel()
{
    string typed = WordBreaker.GetWordRange(Editor.CaretPosition).Text.Trim();

    bool isKnownLabel = VirtualMailBox.VirtualMailBox.Current.Labels.ContainsKey(typed);
    if (!isKnownLabel)
    {
        return;
    }

    AddVisualLabel(new LabelsContainer(typed), true);
}
/// <summary>
/// When keyboard focus leaves the control: discards the unprocessed word at the caret,
/// hides the suggestion list and, if <c>HideOnEmpty</c> is set and the message has no
/// custom labels, collapses the control.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Focus-change event data (unused).</param>
void UserControl_LostKeyboardFocus(object sender, KeyboardFocusChangedEventArgs e)
{
    TextRange tr = WordBreaker.GetWordRange(Editor.CaretPosition);

    // Clear any text which might not have been processed yet.
    tr.Text = String.Empty;

    HideList();

    // Message can be unset (AddVisualLabel guards against null as well); in that case
    // there is nothing to evaluate, so the control is left visible instead of throwing.
    if (HideOnEmpty
        && Message != null
        && !Message.LabelsList.Any(l => l.LabelType == LabelType.Custom))
    {
        Visibility = Visibility.Collapsed;
    }
}
/// <summary>
/// Re-processes a statement: its string form is word-broken, stripped of stop words and
/// stemmed, and the resulting words form a new <c>Statement</c>.
/// </summary>
/// <param name="statement">Statement to normalise; its <c>ToString()</c> output is the input text.</param>
/// <returns>A new statement built from the stemmed words.</returns>
private static Statement StemStatement(Statement statement)
{
    var stemmer = new SStemmer();
    var wordBreaker = new WordBreaker();
    var stopWords = new StopWordRemover();

    // break -> remove stop words -> stem, in that order
    string[] tokens = stemmer.StemWords(
        stopWords.RemoveStopWords(
            wordBreaker.BreakParagraph(statement.ToString())));

    var words = new Word[tokens.Length];
    for (int w = 0; w < tokens.Length; w++)
    {
        words[w] = new Word(tokens[w]);
    }

    return new Statement(words);
}
/// <summary>
/// Runs the word breaker over the file <paramref name="vstup"/> and writes the result to
/// <paramref name="vystup"/>. Does nothing when the input file does not exist.
/// </summary>
/// <param name="vstup">Path of the input file.</param>
/// <param name="vystup">Path of the output file.</param>
internal static void RozdelitNaSlova(string vstup, string vystup)
{
    var breaker = new WordBreaker();

    if (File.Exists(vstup))
    {
        breaker.Input = vstup;
        breaker.Output = vystup;
        breaker.XmlIdFormat = "w-{0}";
        breaker.IgnoredElements = new List<string>
        {
            "teiHeader", "resp", "repository", "idno", "oVar", "catDesc"
        };
        breaker.Punctation = ",.:;…„“‚‘?!—/[]´+─≈+−'#›‹()";
        breaker.Run();
    }
}
/// <summary>
/// Menu handler that word-breaks "Bangla.txt" from the configured DB folder and writes
/// the result back, showing a wait cursor while it runs.
/// Original note: 0980-09FD / 2432-2558 (presumably the character range of interest
/// in hex and decimal - TODO confirm; 0x09FD is 2557 in decimal).
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void ReadWordsToolStripMenuItem_Click(object sender, EventArgs e)
{
    try
    {
        Cursor.Current = Cursors.WaitCursor;

        WordBreaker breaker = new WordBreaker(option.Location.DBFolder);
        string wordFile = Path.Combine(option.Location.DBFolder, "Bangla.txt");

        breaker.InitializeFolder();
        breaker.BreakFile(wordFile);
        breaker.WriteBack();

        WriteLine("Completed word breaking process.");
    }
    catch (Exception ex)
    {
        // Keep the handler best-effort, but do not fail silently: Debug.WriteLine is
        // compiled out of release builds, so also report through the regular output.
        Debug.WriteLine(ex);
        WriteLine("Word breaking process failed: " + ex.Message);
    }
    finally
    {
        Cursor.Current = Cursors.Default;
    }
}
/// <summary>
/// Looks up profiles matching the current search text and shows them in the
/// auto-completion list, or hides the list when there is nothing to show.
/// </summary>
void SearchContacts()
{
    TextRange wordRange = WordBreaker.GetWordRange(Editor.CaretPosition);

    // Make sure we have at least three chars.
    if (wordRange.Text.Length < 3)
    {
        HideList();
        return;
    }

    var shown = AutoCompletionListBox.ItemsSource as List<Profile>;

    // Profiles whose address contains the search text and that can receive private messages.
    var byAddress = mailbox.Profiles
        .Where(p => p.SourceAddress.ToString().IndexOf(Text, StringComparison.InvariantCultureIgnoreCase) > -1)
        .Where(p => p.SourceChannelId == 0 || p.SourceChannel.Charasteristics.SupportsPrivateMessage);

    // Eligible profiles of persons whose name contains the search text.
    var byPersonName = mailbox.Persons
        .Where(r => r.Name.IndexOf(Text, StringComparison.InvariantCultureIgnoreCase) > -1)
        .SelectMany(r => r.Profiles
            .Where(p => p.SourceChannelId == 0 || p.SourceChannel.Charasteristics.SupportsPrivateMessage));

    var query = byAddress
        .Union(byPersonName)
        .OrderByDescending(p => p.Messages.Count)
        .Distinct()
        .Take(10);

    // The query is deferred; it is enumerated while the profiles reader lock is held.
    List<Profile> matches;
    using (mailbox.Profiles.ReaderLock)
    {
        matches = query.ToList();
    }

    // NOTE(review): results are considered unchanged when only the COUNT equals the
    // previously bound list - confirm this is intended.
    if (shown != null && shown.Count == matches.Count)
    {
        return;
    }

    if (matches.Count > 0)
    {
        ShowList(matches);
        return;
    }

    HideList();
}
/// <summary>
/// Replaces the word at the caret with an inline visual element showing
/// <paramref name="source"/> and rebuilds the recipients list.
/// </summary>
/// <param name="source">Object to display as the recipient.</param>
public void AddRecipient(object source)
{
    // Capture the typed word before the inline element is inserted.
    TextRange typedWord = WordBreaker.GetWordRange(Editor.CaretPosition);

    // Create the display object and host it inline at the caret.
    var presenter = new ContentControl { Content = source };
    contentEnd = new InlineUIContainer(presenter, Editor.CaretPosition).ContentEnd;

    // Remove the actual text.
    typedWord.Text = String.Empty;

    // Move the caret to the end of the inserted element.
    Editor.CaretPosition = contentEnd;

    // Rebuild the list of recipients on insert.
    RebuildRecipientsList();
}
/// <summary>
/// Event handler for RichTextBox.TextChanged event. After a paste (signalled by
/// <c>pasteFlag</c>) it scans the document word by word and wraps every
/// "www.microsoft.com" word that is not already inside a hyperlink in a Hyperlink.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Text-changed event data (unused).</param>
private void TextChangedEventHandler(object sender, TextChangedEventArgs e)
{
    if (!this.pasteFlag || this.Document == null)
    {
        return;
    }

    // Temporarily disable TextChanged event handler, since following code might insert Hyperlinks,
    // which will raise another TextChanged event.
    this.TextChanged -= this.TextChangedEventHandler;
    try
    {
        TextPointer navigator = this.Document.ContentStart;
        while (navigator != null && navigator.CompareTo(this.Document.ContentEnd) < 0)
        {
            TextRange wordRange = WordBreaker.GetWordRange(navigator);
            if (wordRange == null || wordRange.IsEmpty)
            {
                // No more words in the document.
                break;
            }

            string wordText = wordRange.Text;
            if (wordText == "www.microsoft.com" &&
                !HyperlinkHelper.IsInHyperlinkScope(wordRange.Start) &&
                !HyperlinkHelper.IsInHyperlinkScope(wordRange.End))
            {
                Hyperlink hyperlink = new Hyperlink(wordRange.Start, wordRange.End);
                navigator = hyperlink.ElementEnd.GetNextInsertionPosition(LogicalDirection.Forward);
            }
            else
            {
                navigator = wordRange.End.GetNextInsertionPosition(LogicalDirection.Forward);
            }
        }
    }
    finally
    {
        // Always restore the subscription and clear the flag, even if the scan throws;
        // otherwise the handler would stay detached for the lifetime of the control.
        this.TextChanged += this.TextChangedEventHandler;
        this.pasteFlag = false;
    }
}
/// <summary>
/// Event handler for KeyDown event to auto-detect hyperlinks on space, enter and backspace keys.
/// Space/Enter wrap the word at the caret in a Hyperlink when it reads "www.microsoft.com";
/// Backspace across a hyperlink boundary removes the hyperlink element but keeps its content.
/// </summary>
/// <param name="sender">The <c>MyRichTextBox</c> raising the event.</param>
/// <param name="e">Key event data; only <c>Key</c> is inspected.</param>
private static void OnKeyDown(object sender, KeyEventArgs e)
{
    MyRichTextBox myRichTextBox = (MyRichTextBox)sender;

    if (e.Key != Key.Back && e.Key != Key.Space && e.Key != Key.Return)
    {
        return;
    }

    if (!myRichTextBox.Selection.IsEmpty)
    {
        myRichTextBox.Selection.Text = String.Empty;
    }

    TextPointer caretPosition = myRichTextBox.Selection.Start;

    if (e.Key == Key.Space || e.Key == Key.Return)
    {
        // Other callers of WordBreaker.GetWordRange check for null; do the same here.
        TextRange wordRange = WordBreaker.GetWordRange(caretPosition);
        if (wordRange != null && wordRange.Text == "www.microsoft.com")
        {
            // Insert hyperlink element at word boundaries.
            new Hyperlink(wordRange.Start, wordRange.End);

            // No need to update RichTextBox caret position,
            // since we only inserted a Hyperlink ElementEnd following current caretPosition.
            // Subsequent handling of space input by base RichTextBox will update selection.
        }
    }
    else // Key.Back
    {
        TextPointer backspacePosition = caretPosition.GetNextInsertionPosition(LogicalDirection.Backward);
        Hyperlink hyperlink;
        if (backspacePosition != null &&
            HyperlinkHelper.IsHyperlinkBoundaryCrossed(caretPosition, backspacePosition, out hyperlink))
        {
            // Remember caretPosition with forward gravity. This is necessary since we are going to delete
            // the hyperlink element preceding caretPosition and after deletion current caretPosition
            // (with backward gravity) will follow content preceding the hyperlink.
            // We want to remember content following the hyperlink to set new caret position at.
            TextPointer newCaretPosition = caretPosition.GetPositionAtOffset(0, LogicalDirection.Forward);

            RemoveHyperlinkKeepingContent(hyperlink);

            // Update selection, since we deleted Hyperlink element and caretPosition
            // was at that Hyperlink's end boundary.
            myRichTextBox.Selection.Select(newCaretPosition, newCaretPosition);
        }
    }
}

/// <summary>
/// Removes a hyperlink element from its parent while keeping its child inlines
/// (and the hyperlink's non-default local formatting) in place.
/// </summary>
private static void RemoveHyperlinkKeepingContent(Hyperlink hyperlink)
{
    // 1. Copy its children Inline to a temporary array.
    InlineCollection hyperlinkChildren = hyperlink.Inlines;
    Inline[] inlines = new Inline[hyperlinkChildren.Count];
    hyperlinkChildren.CopyTo(inlines, 0);

    // 2. Remove each child from parent hyperlink element and insert it after the hyperlink.
    //    Iterating backwards keeps the children in their original order.
    for (int i = inlines.Length - 1; i >= 0; i--)
    {
        hyperlinkChildren.Remove(inlines[i]);
        hyperlink.SiblingInlines.InsertAfter(hyperlink, inlines[i]);
    }

    // 3. Apply hyperlink's local formatting properties to inlines (which are now outside
    //    hyperlink scope). Skipped for a hyperlink without children, where indexing
    //    inlines[0] would throw.
    if (inlines.Length > 0)
    {
        LocalValueEnumerator localProperties = hyperlink.GetLocalValueEnumerator();
        TextRange inlineRange = new TextRange(inlines[0].ContentStart, inlines[inlines.Length - 1].ContentEnd);

        while (localProperties.MoveNext())
        {
            LocalValueEntry property = localProperties.Current;
            DependencyProperty dp = property.Property;
            object value = property.Value;

            if (!dp.ReadOnly &&
                dp != Inline.TextDecorationsProperty && // Ignore hyperlink defaults.
                dp != TextElement.ForegroundProperty &&
                dp != BaseUriHelper.BaseUriProperty &&
                !HyperlinkHelper.IsHyperlinkProperty(dp))
            {
                inlineRange.ApplyPropertyValue(dp, value);
            }
        }
    }

    // 4. Delete the (empty) hyperlink element.
    hyperlink.SiblingInlines.Remove(hyperlink);
}
/// <summary>
/// Keyboard handling for the recipient editor: arrow keys move the selection in the
/// auto-completion list, Enter/Tab accept it, Escape hides the list, and separator keys
/// (space, comma, semicolon) process the word being typed.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Key event data; <c>Handled</c> is set for keys consumed here.</param>
void Editor_PreviewKeyDown(object sender, KeyEventArgs e)
{
    wordFlipper.Delay();

    switch (e.Key)
    {
        case Key.Up:
            // Move selection up
            if (AutoCompletionListBox.SelectedIndex > 0)
            {
                AutoCompletionListBox.SelectedIndex--;
            }
            e.Handled = true;
            break;

        case Key.Down:
            // Move selection down, stopping at the last item (index Count - 1).
            if (AutoCompletionListBox.SelectedIndex < AutoCompletionListBox.Items.Count - 1)
            {
                AutoCompletionListBox.SelectedIndex++;
            }
            e.Handled = true;
            break;

        case Key.Enter:
            // Accept selection
            if (IsPopupOpen)
            {
                InsertSelectedContact();
            }
            break;

        case Key.Escape:
            // Restore state before we opened the list
            HideList();
            e.Handled = true;
            break;

        case Key.Tab:
            if (IsPopupOpen)
            {
                InsertSelectedContact();
            }
            else
            {
                string text = WordBreaker.GetWordRange(Editor.CaretPosition).Text.Trim();

                // Break out when Tab is used and nothing has been entered; the event is
                // deliberately left unhandled in that case.
                if (String.IsNullOrEmpty(text))
                {
                    return;
                }

                ProcessCurrentWord();
            }
            e.Handled = true;
            break;

        case Key.Space:
        case Key.OemComma:
        case Key.OemSemicolon:
            // Add word being typed in by user
            ProcessCurrentWord();
            break;

        default:
            // Clear color of range
            TextRange currentRange = WordBreaker.GetWordRange(Editor.CaretPosition);
            currentRange.ApplyPropertyValue(TextBlock.ForegroundProperty, Brushes.Black);
            break;
    }
}
// Moves an endpoint backward by a number of text units.
// The endpoint is a plain character index into the text, so the same routine serves
// whichever endpoint of the range the caller passes in.
//   index - current character index of the endpoint.
//   unit  - the unit to move by.
//   count - number of units to move; expected to be negative (the loops run while moved > count).
//   moved - receives the number of units actually moved (zero or negative).
// Returns the new character index. Throws InvalidEnumArgumentException for an unknown unit.
private int MoveEndpointBackward(int index, TextUnit unit, int count, out int moved)
{
    switch (unit)
    {
        case TextUnit.Character:
            {
                int limit = _provider.GetTextLength();
                ValidateEndpoints();

                // NOTE(review): the bound is -(index + 1), so from index 0 a move of -1 is
                // reported although the index is clamped back to 0 below - confirm intended.
                int oneBasedIndex = index + 1;

                moved = Math.Max(count, -oneBasedIndex);
                index = index + moved;

                index = index < 0 ? 0 : index;
            }
            break;

        case TextUnit.Word:
            {
                string text = _provider.GetText();
                ValidateEndpoints();

#if WCP_NLS_ENABLED
                // use the same word breaker as Avalon Text.
                WordBreaker breaker = new WordBreaker();
                TextContainer container = new TextContainer(text);
                TextNavigator navigator = new TextNavigator(index, container);

                // move backward one word break for each count
                for (moved = 0; moved > count && index > 0; moved--)
                {
                    if (!breaker.MoveToPreviousWordBreak(navigator))
                        break;
                }

                index = navigator.Position;
#else
                // For each unit, step left at least one character and keep going
                // until AtWordBoundary reports a boundary.
                for (moved = 0; moved > count && index > 0; moved--)
                {
                    for (index--; !AtWordBoundary(text, index); index--) ;
                }
#endif
            }
            break;

        case TextUnit.Line:
            {
                // Note count < 0.

                // Get 1-based line.
                int line = _provider.LineFromChar(index) + 1;

                int lineMax = _provider.GetLineCount();

                // Truncate the count to the number of available lines.
                int actualCount = Math.Max(count, -line);

                moved = actualCount;

                if (actualCount == -line)
                {
                    // We are moving by the maximum number of possible lines,
                    // so we know the resulting index will be 0.
                    index = 0;

                    // If a line other than the first consists of only "\r\n",
                    // you can move backwards past this line and the position changes,
                    // hence this is counted. The first line is special, though:
                    // if it is empty, and you move say from the second line back up
                    // to the first, you cannot move further; however if the first line
                    // is nonempty, you can move from the end of the first line to its
                    // beginning! This latter move is counted, but if the first line
                    // is empty, it is not counted.

                    // Recalculate the value of "moved".
                    // The first line is empty if it consists only of
                    // a line separator sequence.
                    bool firstLineEmpty =
                        ((lineMax > 1 && _provider.LineIndex(1) == _lineSeparator.Length)
                            || lineMax == 0);

                    if (moved < 0 && firstLineEmpty)
                    {
                        ++moved;
                    }
                }
                else // actualCount > -line
                {
                    // Move the endpoint to the beginning of the following line,
                    // then back by the line separator length to get to the end
                    // of the previous line, since the Edit control has
                    // no method to get the character index of the end
                    // of a line directly.
                    index = _provider.LineIndex(line + actualCount) - _lineSeparator.Length;
                }
            }
            break;

        case TextUnit.Paragraph:
            {
                // just like moving words but we look for paragraph boundaries instead of
                // word boundaries.
                string text = _provider.GetText();
                ValidateEndpoints();

                for (moved = 0; moved > count && index > 0; moved--)
                {
                    for (index--; !AtParagraphBoundary(text, index); index--) ;
                }
            }
            break;

        case TextUnit.Format:
        case TextUnit.Page:
        case TextUnit.Document:
            {
                // since edit controls are plain text moving one uniform format unit will
                // take us all the way to the beginning of the document, just like
                // "pages" and document.

                // we'll move 1 format unit if we aren't already at the beginning of the
                // document.  Otherwise, we won't move at all.
                moved = index > 0 ? -1 : 0;
                index = 0;
            }
            break;

        default:
            throw new System.ComponentModel.InvalidEnumArgumentException("unit", (int)unit, typeof(TextUnit));
    }

    return index;
}
// Expands this range so that it covers whole units of the requested kind.
// Focus is set to the provider's window first. Start and End are adjusted in place
// (or through MoveTo). Throws InvalidEnumArgumentException for an unknown unit.
void ITextRangeProvider.ExpandToEnclosingUnit(TextUnit unit)
{
    Misc.SetFocus(_provider._hwnd);

    switch (unit)
    {
        case TextUnit.Character:
            // if it is a degenerate range then expand it to be one character.
            // otherwise, leave it as it is.
            if (Start == End)
            {
                int moved;
                End = MoveEndpointForward(End, TextUnit.Character, 1, out moved);
            }
            break;

        case TextUnit.Word:
            {
                // this works same as paragraph except we look for word boundaries instead of paragraph boundaries.

                // get the text so we can figure out where the boundaries are
                string text = _provider.GetText();
                ValidateEndpoints();

#if WCP_NLS_ENABLED
                // use the same word breaker that Avalon Text uses.
                WordBreaker breaker = new WordBreaker();
                TextContainer container = new TextContainer(text);
                // if the starting point of the range is not already at a word break
                // then move it backwards to the nearest word break.
                TextNavigator startNavigator = new TextNavigator(Start, container);
                if (!breaker.IsAtWordBreak(startNavigator))
                {
                    breaker.MoveToPreviousWordBreak(startNavigator);
                    Start = startNavigator.Position;
                }

                // if the range is degenerate or the ending point of the range is not already at a word break
                // then move it forwards to the nearest word break.
                TextNavigator endNavigator = new TextNavigator(End, container);
                if (Start==End || !breaker.IsAtWordBreak(endNavigator))
                {
                    breaker.MoveToNextWordBreak(endNavigator);
                    End = endNavigator.Position;
                }
#else
                // move start left until we reach a word boundary.
                for (; !AtWordBoundary(text, Start); Start--) ;

                // move end right until we reach word boundary (different from Start).
                // End is first forced to at least Start + 1 and at most text.Length.
                End = Math.Min(Math.Max(End, Start + 1), text.Length);
                for (; !AtWordBoundary(text, End); End++) ;
#endif
            }
            break;

        case TextUnit.Line:
            {
                if (_provider.GetLineCount() != 1)
                {
                    // Cover from the first character of the start line up to the first
                    // character of the line following the end line.
                    // NOTE(review): what LineIndex returns when endLine is the last line
                    // is not visible here - confirm.
                    int startLine = _provider.LineFromChar(Start);
                    int endLine = _provider.LineFromChar(End);

                    MoveTo(_provider.LineIndex(startLine), _provider.LineIndex(endLine + 1));
                }
                else
                {
                    // Single line: the range becomes the whole text.
                    MoveTo(0, _provider.GetTextLength());
                }
            }
            break;

        case TextUnit.Paragraph:
            {
                // this works the same as the word case except we look for paragraph
                // boundaries instead of word boundaries.

                // get the text so we can figure out where the boundaries are
                string text = _provider.GetText();
                ValidateEndpoints();

                // move start left until we reach a paragraph boundary.
                for (; !AtParagraphBoundary(text, Start); Start--);

                // move end right until we reach a paragraph boundary (different from Start).
                End = Math.Min(Math.Max(End, Start + 1), text.Length);
                for (; !AtParagraphBoundary(text, End); End++);
            }
            break;

        case TextUnit.Format:
        case TextUnit.Page:
        case TextUnit.Document:
            // All three units expand the range to the entire text.
            MoveTo(0, _provider.GetTextLength());
            break;

        default:
            throw new System.ComponentModel.InvalidEnumArgumentException("unit", (int)unit, typeof(TextUnit));
    }
}
// Moves an endpoint forward by a number of text units.
// The endpoint is a plain character index into the text, so the same routine serves
// whichever endpoint of the range the caller passes in.
//   index - current character index of the endpoint.
//   unit  - the unit to move by.
//   count - number of units to move; expected to be positive (the loops run while moved < count).
//   moved - receives the number of units actually moved.
// Returns the new character index. Throws InvalidEnumArgumentException for an unknown unit.
private int MoveEndpointForward(int index, TextUnit unit, int count, out int moved)
{
    switch (unit)
    {
        case TextUnit.Character:
            {
                int limit = _provider.GetTextLength();
                ValidateEndpoints();

                // Never move past the end of the text.
                moved = Math.Min(count, limit - index);
                index = index + moved;

                index = index > limit ? limit : index;
            }
            break;

        case TextUnit.Word:
            {
                string text = _provider.GetText();
                ValidateEndpoints();

#if WCP_NLS_ENABLED
                // use the same word breaker as Avalon Text.
                WordBreaker breaker = new WordBreaker();
                TextContainer container = new TextContainer(text);
                TextNavigator navigator = new TextNavigator(index, container);

                // move forward one word break for each count
                for (moved = 0; moved < count && index < text.Length; moved++)
                {
                    if (!breaker.MoveToNextWordBreak(navigator))
                        break;
                }

                index = navigator.Position;
#else
                // For each unit, step right at least one character and keep going
                // until AtWordBoundary reports a boundary.
                for (moved = 0; moved < count && index < text.Length; moved++)
                {
                    for (index++; !AtWordBoundary(text, index); index++) ;
                }
#endif
            }
            break;

        case TextUnit.Line:
            {
                // figure out what line we are on.
                int line = _provider.LineFromChar(index);

                // limit the number of lines moved to the number of lines available to move
                // Note lineMax is always >= 1.
                int lineMax = _provider.GetLineCount();
                moved = Math.Min(count, lineMax - line - 1);

                if (moved > 0)
                {
                    // move the endpoint to the beginning of the destination line.
                    index = _provider.LineIndex(line + moved);
                }
                else if (moved == 0 && lineMax == 1)
                {
                    // There is only one line so get the text length as endpoint
                    // and report one unit moved.
                    index = _provider.GetTextLength();
                    moved = 1;
                }
            }
            break;

        case TextUnit.Paragraph:
            {
                // just like moving words but we look for paragraph boundaries instead of
                // word boundaries.
                string text = _provider.GetText();
                ValidateEndpoints();

                for (moved = 0; moved < count && index < text.Length; moved++)
                {
                    for (index++; !AtParagraphBoundary(text, index); index++) ;
                }
            }
            break;

        case TextUnit.Format:
        case TextUnit.Page:
        case TextUnit.Document:
            {
                // since edit controls are plain text moving one uniform format unit will
                // take us all the way to the end of the document, just like
                // "pages" and document.
                int limit = _provider.GetTextLength();
                ValidateEndpoints();

                // we'll move 1 format unit if we aren't already at the end of the
                // document.  Otherwise, we won't move at all.
                moved = index < limit ? 1 : 0;
                index = limit;
            }
            break;

        default:
            throw new System.ComponentModel.InvalidEnumArgumentException("unit", (int)unit, typeof(TextUnit));
    }

    return index;
}
/// <summary>
/// Feeds a set of documents into an existing correlation matrix in parallel. Each document
/// is broken into words, stemmed and stripped of stop words before being added. The matrix
/// and the monitored-scope statistics are always written to freshly named files afterwards,
/// even when processing fails.
/// </summary>
/// <param name="existingMatrix">Matrix to update; it is modified in place.</param>
/// <param name="documents">Raw text of the documents to learn from.</param>
/// <returns>The same <paramref name="existingMatrix"/> instance.</returns>
public CorrelationMatrix UpdateCorrelationMatrix(CorrelationMatrix existingMatrix, IEnumerable<string> documents)
{
    StopWordRemover stopwordRemover = new StopWordRemover();
    SentenceBreaker sb = SentenceBreaker.Instance;

    // Starts at 0: Interlocked.Increment returns the incremented value, so the first
    // document is reported as number 1 (it was reported as 2 when the counter started at 1).
    int processedDocuments = 0;

    try
    {
        Parallel.ForEach(documents, (documentContents, loopState) =>
        {
            int documentNumber = Interlocked.Increment(ref processedDocuments);

            using (new MonitoredScope("Learning from a document No. " + documentNumber.ToString()))
            {
                // A stemmer is created per document; the stop-word remover and sentence
                // breaker instances are shared by all iterations.
                SStemmer stemmer = new SStemmer();

                string[] words = sb.BreakIntoWords(documentContents);
                words = stemmer.StemWords(words);
                words = stopwordRemover.RemoveStopWords(words);

                // NOTE(review): Add is invoked concurrently from several iterations -
                // confirm CorrelationMatrix.Add is safe for concurrent use.
                existingMatrix.Add(words);
            }

            Logger.Log("Finished document number: " + documentNumber.ToString());

            // Stop scheduling further documents once the vocabulary is large enough.
            if (existingMatrix.Words.Count > 100000)
            {
                loopState.Break();
            }
        });
    }
    finally
    {
        Logger.Log("Unique words: " + existingMatrix.WordsMetadata.Count + ", Pairs: " + existingMatrix.Matrix.Count);

        string filename = "autorss_" + Guid.NewGuid().ToString();
        using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
        {
            new CorrelationMatrixBinarySerializer().Serialize(fs, existingMatrix);
        }
        Logger.Log("Correlation Matrix saved to file: " + filename);

        filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
        using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
        {
            MonitoredScope.SerializeStatistics(fs);
        }
        Logger.Log("MonitoredScopes saved to file: " + filename);
    }

    return existingMatrix;
}
/// <summary>
/// Builds a correlation matrix from a range of articles in a Wikipedia XML dump and saves
/// the matrix and the monitored-scope statistics to freshly named files.
/// </summary>
/// <param name="programArgs">
/// Supplies <c>WikipediaStartArticle</c> (number of "text" elements to skip) and
/// <c>WikipediaEndArticle</c> (exclusive upper bound of the article index).
/// </param>
private static void CalculateCorrelationFromWikipediaDB(ProgramArguments programArgs)
{
    WordBreaker wordBreaker = new WordBreaker();
    StopWordRemover stopwordRemover = new StopWordRemover();
    SStemmer stemmer = new SStemmer();
    CorrelationMatrix correlationMatrix = new CorrelationMatrix();

    // NOTE(review): machine-specific, hard-coded dump location.
    string wikipediaPath = @"C:\Users\haabu\Downloads\enwiki-latest-pages-articles.xml\enwiki-latest-pages-articles.xml";

    // The stream gets its own using block: XmlReader.Create(Stream) does not close the
    // underlying stream by default, so disposing only the reader leaked the file handle.
    // The dump is only read, so it is opened with read access.
    using (FileStream input = new FileStream(wikipediaPath, FileMode.Open, FileAccess.Read))
    using (XmlReader sr = XmlReader.Create(input))
    {
        bool endOfInput = false;

        // Skip the articles that precede the requested range.
        for (int i = 0; i < programArgs.WikipediaStartArticle; i++)
        {
            if (!sr.ReadToFollowing("text"))
            {
                endOfInput = true;
                break;
            }
        }

        for (int i = programArgs.WikipediaStartArticle; !endOfInput && i < programArgs.WikipediaEndArticle; i++)
        {
            if (!sr.ReadToFollowing("text"))
            {
                // No further "text" elements: stop instead of polling the exhausted reader.
                break;
            }

            sr.ReadStartElement();
            string pageContents = sr.ReadContentAsString();

            // break -> remove stop words -> stem -> add to the matrix
            string[] words = wordBreaker.BreakParagraph(pageContents);
            words = stopwordRemover.RemoveStopWords(words);
            words = stemmer.StemWords(words);
            correlationMatrix.Add(words);

            Logger.Log("Finished document number: " + (i + 1).ToString());
        }
    }

    string filename = "autorss_" + Guid.NewGuid().ToString();
    using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
    {
        // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data. It is
        // kept because readers of these files depend on the format; consider migrating.
        BinaryFormatter formatter = new BinaryFormatter();
        formatter.Serialize(fs, correlationMatrix);
    }
    Logger.Log("Saved to file: " + filename);

    filename = "autorss_Scopes_" + Guid.NewGuid().ToString();
    using (FileStream fs = new FileStream(filename, FileMode.CreateNew))
    {
        MonitoredScope.SerializeStatistics(fs);
    }
    Logger.Log("Saved to file: " + filename);
}