/// <summary>
/// Classifies the text currently held by <c>editorPane</c> and highlights every labeled entity in it.
/// </summary>
/// <remarks>
/// <para>
/// When the pane's content type is anything other than <c>text/html</c>, the document text is classified
/// with inline XML, the tag pairs are stripped one at a time, and the attribute set returned by
/// <c>GetAttributeSet</c> is applied to the characters each pair enclosed.
/// </para>
/// <para>
/// When the content type is <c>text/html</c>, each opening tag is rewritten to a colored <c>span</c>
/// (color taken from <c>tagToColorMap</c>), each closing tag to <c>&lt;/span&gt;</c>, and the resulting
/// markup is written back into the pane.
/// </para>
/// <para>
/// Both paths store the classifier output in <c>taggedContents</c> and the raw text in
/// <c>untaggedContents</c>, and enable <c>saveTaggedAs</c> at the end.
/// </para>
/// </remarks>
/// <exception cref="Exception">
/// Reading the document text or applying character attributes failed; the original exception is
/// available as <c>InnerException</c>.
/// </exception>
private void Extract()
{
    string contentType = editorPane.GetContentType();
    log.Info("content type: " + contentType);

    if (!contentType.Equals("text/html"))
    {
        DefaultStyledDocument doc = (DefaultStyledDocument)editorPane.GetDocument();
        string text = null;
        try
        {
            text = doc.GetText(0, doc.GetLength());
        }
        catch (Exception e)
        {
            // Keep the cause attached so the failure can still be diagnosed by the caller.
            throw new Exception(e.Message, e);
        }

        string labeledText = classifier.ClassifyWithInlineXML(text);
        taggedContents = labeledText;
        untaggedContents = text;

        // Alternation of every label except the background symbol, e.g. "A|B|C".
        ICollection<string> tags = classifier.Labels();
        string background = classifier.BackgroundSymbol();
        List<string> entityTags = new List<string>();
        foreach (string tag in tags)
        {
            if (!background.Equals(tag))
            {
                entityTags.Add(tag);
            }
        }
        string tagPattern = string.Join("|", entityTags.ToArray());
        Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
        Pattern endPattern = Pattern.Compile("</(" + tagPattern + ")>");

        // Each pass removes one opening tag and its closing tag, so offsets into finalText
        // line up with offsets into the (untagged) document.
        string finalText = labeledText;
        Matcher m = startPattern.Matcher(finalText);
        while (m.Find())
        {
            int start = m.Start();
            finalText = m.ReplaceFirst(string.Empty);

            m = endPattern.Matcher(finalText);
            if (m.Find())
            {
                int end = m.Start();
                string tag_1 = m.Group(1);
                finalText = m.ReplaceFirst(string.Empty);

                IAttributeSet attSet = GetAttributeSet(tag_1);
                string entity;
                try
                {
                    entity = Sharpen.Runtime.Substring(finalText, start, end);
                    doc.SetCharacterAttributes(start, entity.Length, attSet, false);
                }
                catch (Exception ex)
                {
                    throw new Exception(ex.Message, ex);
                }
                log.Info(tag_1 + ": " + entity);
            }
            else
            {
                // Previously a silent no-op; report the unbalanced tag instead.
                log.Info("Couldn't find end pattern!");
            }

            // The text changed, so the matcher has to be rebuilt over the new string.
            m = startPattern.Matcher(finalText);
        }

        editorPane.Revalidate();
        editorPane.Repaint();
    }
    else
    {
        untaggedContents = editorPane.GetText();
        taggedContents = classifier.ClassifyWithInlineXML(untaggedContents);

        // Alternation of every label except the background symbol, e.g. "A|B|C".
        ICollection<string> tags = classifier.Labels();
        string background = classifier.BackgroundSymbol();
        List<string> entityTags = new List<string>();
        foreach (string tag in tags)
        {
            if (!background.Equals(tag))
            {
                entityTags.Add(tag);
            }
        }
        string tagPattern = string.Join("|", entityTags.ToArray());
        Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
        Pattern endPattern = Pattern.Compile("</(" + tagPattern + ")>");

        string finalText = taggedContents;
        Matcher m = startPattern.Matcher(finalText);
        while (m.Find())
        {
            string tag_1 = m.Group(1);
            // Capture the position before replacing: the replacement call may disturb matcher state.
            int tagStart = m.Start();

            string color = ColorToHTML(tagToColorMap[tag_1]);
            string newTag = "<span style=\"background-color: " + color + "; color: white\">";
            finalText = m.ReplaceFirst(newTag);
            int start = tagStart + newTag.Length;

            Matcher m1 = endPattern.Matcher(finalText);
            if (m1.Find(start))
            {
                string entity = Sharpen.Runtime.Substring(finalText, start, m1.Start());
                log.Info(tag_1 + ": " + entity);
                finalText = m1.ReplaceFirst("</span>");
            }
            else
            {
                // Without this check, reading the match position of a failed search aborted the method.
                log.Info("Couldn't find end pattern for " + tag_1);
            }

            m = startPattern.Matcher(finalText);
        }

        System.Console.Out.WriteLine(finalText);
        editorPane.SetText(finalText);
        editorPane.Revalidate();
        editorPane.Repaint();
        log.Info(finalText);
    }

    saveTaggedAs.SetEnabled(true);
}
/// <summary>
/// Classifies the text currently held by <c>editorPane</c> and highlights every labeled entity in it.
/// </summary>
/// <remarks>
/// <para>
/// When the pane's content type is anything other than <c>text/html</c>, the document text is classified
/// with inline XML, the tag pairs are stripped one at a time, and the attribute set returned by
/// <c>GetAttributeSet</c> is applied to the characters each pair enclosed. A failure while reading the
/// document is logged and treated as empty text; a failure while applying attributes is logged and the
/// process exits.
/// </para>
/// <para>
/// When the content type is <c>text/html</c>, each opening tag that has an entry in <c>tagToColorMap</c>
/// is rewritten to a colored <c>span</c> and its closing tag to <c>&lt;/span&gt;</c>; tags without a
/// color are left untouched. The resulting markup is written back into the pane.
/// </para>
/// <para>
/// Both paths store the classifier output in <c>taggedContents</c> and enable <c>saveTaggedAs</c> at the end.
/// </para>
/// </remarks>
private void Extract()
{
    string contentType = editorPane.GetContentType();
    log.Info("content type: " + contentType);

    if (!contentType.Equals("text/html"))
    {
        DefaultStyledDocument doc = (DefaultStyledDocument)editorPane.GetDocument();
        string text = null;
        try
        {
            text = doc.GetText(0, doc.GetLength());
        }
        catch (Exception e)
        {
            log.Err(e);
        }
        if (text == null)
        {
            // Same fallback the HTML path uses: never hand the classifier a null string.
            text = string.Empty;
        }

        string labeledText = classifier.ClassifyWithInlineXML(text);
        taggedContents = labeledText;

        // Alternation of every label except the background symbol, e.g. "A|B|C".
        ICollection<string> tags = classifier.Labels();
        string background = classifier.BackgroundSymbol();
        StringBuilder tagPattern = new StringBuilder();
        foreach (string tag in tags)
        {
            if (background.Equals(tag))
            {
                continue;
            }
            if (tagPattern.Length > 0)
            {
                tagPattern.Append('|');
            }
            tagPattern.Append(tag);
        }
        Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
        Pattern endPattern = Pattern.Compile("</(" + tagPattern + ")>");

        // Each pass removes one opening tag and its closing tag, so offsets into finalText
        // line up with offsets into the (untagged) document.
        string finalText = labeledText;
        Matcher m = startPattern.Matcher(finalText);
        while (m.Find())
        {
            int start = m.Start();
            finalText = m.ReplaceFirst(string.Empty);

            m = endPattern.Matcher(finalText);
            if (m.Find())
            {
                int end = m.Start();
                string tag_1 = m.Group(1);
                finalText = m.ReplaceFirst(string.Empty);

                IAttributeSet attSet = GetAttributeSet(tag_1);
                try
                {
                    string entity = Sharpen.Runtime.Substring(finalText, start, end);
                    doc.SetCharacterAttributes(start, entity.Length, attSet, false);
                }
                catch (Exception ex)
                {
                    log.Err(ex);
                    // NOTE(review): terminating the whole process from a UI action is drastic —
                    // confirm this is still wanted rather than abandoning the highlight pass.
                    System.Environment.Exit(-1);
                }
                log.Info(tag_1 + ": " + Sharpen.Runtime.Substring(finalText, start, end));
            }
            else
            {
                log.Info("Couldn't find end pattern!");
            }

            // The text changed, so the matcher has to be rebuilt over the new string.
            m = startPattern.Matcher(finalText);
        }

        editorPane.Revalidate();
        editorPane.Repaint();
    }
    else
    {
        string untaggedContents = editorPane.GetText();
        if (untaggedContents == null)
        {
            untaggedContents = string.Empty;
        }
        taggedContents = classifier.ClassifyWithInlineXML(untaggedContents);

        // Alternation of every label except the background symbol, e.g. "A|B|C".
        ICollection<string> tags = classifier.Labels();
        string background = classifier.BackgroundSymbol();
        StringBuilder tagPattern = new StringBuilder();
        foreach (string tag in tags)
        {
            if (background.Equals(tag))
            {
                continue;
            }
            if (tagPattern.Length > 0)
            {
                tagPattern.Append('|');
            }
            tagPattern.Append(tag);
        }
        Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
        Pattern endPattern = Pattern.Compile("</(" + tagPattern + ")>");

        string finalText = taggedContents;
        Matcher m = startPattern.Matcher(finalText);
        // Index from which the next opening tag is searched; moves past tags that were
        // skipped so they are neither revisited nor mistaken for the current match.
        int searchFrom = 0;
        while (m.Find(searchFrom))
        {
            string tag_1 = m.Group(1);
            int tagStart = m.Start();
            int tagEnd = m.End();

            Color col = tagToColorMap[tag_1];
            if (col == null)
            {
                // No color configured for this label: leave its markup as it is.
                searchFrom = tagEnd;
                continue;
            }

            string color = ColorToHTML(col);
            string newTag = "<span style=\"background-color: " + color + "; color: white\">";
            // Splice at the exact match position. A "replace first" here would hit the first
            // opening tag in the text, which is a different tag once an earlier one was skipped.
            finalText = Sharpen.Runtime.Substring(finalText, 0, tagStart) + newTag + finalText.Substring(tagEnd);
            int start = tagStart + newTag.Length;

            Matcher m1 = endPattern.Matcher(finalText);
            if (m1.Find(start))
            {
                int entityEnd = m1.Start();
                int closeEnd = m1.End();
                string entity = Sharpen.Runtime.Substring(finalText, start, entityEnd);
                log.Info(tag_1 + ": " + entity);
                // Replace the closing tag that was actually found, not the first one in the text.
                finalText = Sharpen.Runtime.Substring(finalText, 0, entityEnd) + "</span>" + finalText.Substring(closeEnd);
            }
            else
            {
                log.Warn("Failed to find end for " + tag_1);
            }

            // The text changed, so the matcher has to be rebuilt over the new string.
            m = startPattern.Matcher(finalText);
            searchFrom = start;
        }

        editorPane.SetText(finalText);
        editorPane.Revalidate();
        editorPane.Repaint();
    }

    saveTaggedAs.SetEnabled(true);
}