/// <summary>
/// Creates the annotator, reading its configuration from <paramref name="props"/> and loading
/// the word lists (and, when configured, the character map) it needs.
/// </summary>
/// <remarks>
/// Property keys read here: <c>verbose</c>, <c>booknlpCoref</c>, <c>modelPath</c>,
/// <c>charactersPath</c>, <c>QMSieves</c>, <c>MSSieves</c>, <c>familyWordsFile</c>,
/// <c>animacyWordsFile</c>, <c>genderNamesFile</c> and <c>useCoref</c>.
/// When no <c>charactersPath</c> is configured, no character map is loaded and
/// <c>buildCharacterMapPerAnnotation</c> is set to <c>true</c> instead.
/// </remarks>
/// <param name="props">Configuration properties for this annotator.</param>
public QuoteAttributionAnnotator(Properties props)
{
    Verbose = PropertiesUtils.GetBool(props, "verbose", false);
    Timing timer = null;

    // Settings: these paths go in the props file.
    CorefPath = props.GetProperty("booknlpCoref", null);
    if (CorefPath == null && Verbose)
    {
        log.Err("Warning: no coreference map!");
    }
    ModelPath = props.GetProperty("modelPath", DefaultModelPath);
    CharactersFile = props.GetProperty("charactersPath", null);
    if (CharactersFile == null && Verbose)
    {
        log.Err("Warning: no characters file!");
    }
    qmSieveList = props.GetProperty("QMSieves", DefaultQmsieves);
    msSieveList = props.GetProperty("MSSieves", DefaultMssieves);

    if (Verbose)
    {
        // The timer only exists in verbose mode; it is stopped at the end of the constructor.
        timer = new Timing();
        log.Info("Loading QuoteAttribution coref [" + CorefPath + "]...");
        log.Info("Loading QuoteAttribution characters [" + CharactersFile + "]...");
    }

    // Loading all our word lists; the current field values act as the defaults.
    FamilyWordList = props.GetProperty("familyWordsFile", FamilyWordList);
    AnimacyWordList = props.GetProperty("animacyWordsFile", AnimacyWordList);
    GenderWordList = props.GetProperty("genderNamesFile", GenderWordList);
    familyRelations = QuoteAttributionUtils.ReadFamilyRelations(FamilyWordList);
    genderMap = QuoteAttributionUtils.ReadGenderedNounList(GenderWordList);
    animacyList = QuoteAttributionUtils.ReadAnimacyList(AnimacyWordList);

    // Decide on the configured path, not on the map being assigned: testing characterMap here
    // would skip loading whenever the map has not been populated yet.
    if (CharactersFile != null)
    {
        characterMap = QuoteAttributionUtils.ReadPersonMap(CharactersFile);
    }
    else
    {
        // No characters file: presumably the map is then built from each annotation -- confirm
        // against the code that reads this flag.
        buildCharacterMapPerAnnotation = true;
    }

    // Use Stanford CoreNLP coref to map mentions to canonical mentions.
    useCoref = PropertiesUtils.GetBool(props, "useCoref", useCoref);

    if (Verbose)
    {
        timer.Stop("done.");
    }
}
/// <summary>
/// Runs the classifier over the editor pane's contents and highlights every tagged entity.
/// </summary>
/// <remarks>
/// For non-HTML content the inline XML tags produced by the classifier are stripped one pair at
/// a time and the enclosed range of the styled document is given the attribute set for that tag.
/// For <c>text/html</c> content each start tag is replaced by a colored span element, each end
/// tag by a closing span, and the result is written back to the editor pane.
/// In both cases the classifier's inline-XML output is stored in <c>taggedContents</c> and
/// <c>saveTaggedAs</c> is enabled afterwards. If the plain-text document cannot be read, the
/// method returns after logging without classifying anything.
/// </remarks>
private void Extract()
{
    log.Info("content type: " + editorPane.GetContentType());
    if (!editorPane.GetContentType().Equals("text/html"))
    {
        DefaultStyledDocument doc = (DefaultStyledDocument)editorPane.GetDocument();
        string text = null;
        try
        {
            text = doc.GetText(0, doc.GetLength());
        }
        catch (Exception e)
        {
            log.Err(e);
        }
        if (text == null)
        {
            // The document text could not be obtained (any exception was logged above);
            // do not hand a null string to the classifier.
            return;
        }
        string labeledText = classifier.ClassifyWithInlineXML(text);
        taggedContents = labeledText;

        string tagPattern = BuildTagAlternation();
        Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
        Pattern endPattern = Pattern.Compile("</(" + tagPattern + ")>");

        string finalText = labeledText;
        Matcher m = startPattern.Matcher(finalText);
        while (m.Find())
        {
            // Remove the start tag first so that offsets into finalText line up with the
            // untagged document text.
            int start = m.Start();
            finalText = m.ReplaceFirst(string.Empty);
            m = endPattern.Matcher(finalText);
            if (m.Find())
            {
                int end = m.Start();
                string tag = m.Group(1);
                finalText = m.ReplaceFirst(string.Empty);
                IAttributeSet attSet = GetAttributeSet(tag);
                try
                {
                    string entity = Sharpen.Runtime.Substring(finalText, start, end);
                    doc.SetCharacterAttributes(start, entity.Length, attSet, false);
                }
                catch (Exception ex)
                {
                    // A failure while styling terminates the whole process.
                    log.Err(ex);
                    System.Environment.Exit(-1);
                }
                log.Info(tag + ": " + Sharpen.Runtime.Substring(finalText, start, end));
            }
            else
            {
                log.Info("Couldn't find end pattern!");
            }
            // finalText changed, so matching restarts on the updated text.
            m = startPattern.Matcher(finalText);
        }
        editorPane.Revalidate();
        editorPane.Repaint();
    }
    else
    {
        string untaggedContents = editorPane.GetText();
        if (untaggedContents == null)
        {
            untaggedContents = string.Empty;
        }
        taggedContents = classifier.ClassifyWithInlineXML(untaggedContents);

        string tagPattern = BuildTagAlternation();
        Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
        Pattern endPattern = Pattern.Compile("</(" + tagPattern + ")>");

        string finalText = taggedContents;
        Matcher m = startPattern.Matcher(finalText);
        while (m.Find())
        {
            string tag = m.Group(1);
            Color col = tagToColorMap[tag];
            // NOTE(review): a tag without a color is left in the text and the same matcher moves
            // on. ReplaceFirst below presumably rewrites the first start tag in finalText, which
            // is the current match only if no earlier tag was skipped this way -- confirm.
            if (col != null)
            {
                string color = ColorToHTML(col);
                string newTag = "<span style=\"background-color: " + color + "; color: white\">";
                finalText = m.ReplaceFirst(newTag);
                int start = m.Start() + newTag.Length;
                Matcher m1 = endPattern.Matcher(finalText);
                if (m1.Find(m.End()))
                {
                    string entity = Sharpen.Runtime.Substring(finalText, start, m1.Start());
                    log.Info(tag + ": " + entity);
                }
                else
                {
                    log.Warn("Failed to find end for " + tag);
                }
                finalText = m1.ReplaceFirst("</span>");
                // finalText changed, so matching restarts on the updated text.
                m = startPattern.Matcher(finalText);
            }
        }
        editorPane.SetText(finalText);
        editorPane.Revalidate();
        editorPane.Repaint();
    }
    saveTaggedAs.SetEnabled(true);
}

/// <summary>
/// Joins every classifier label except the background symbol with <c>|</c>, producing the
/// alternation used inside the start-tag and end-tag patterns.
/// </summary>
/// <returns>The labels separated by <c>|</c>; empty when there is no non-background label.</returns>
private string BuildTagAlternation()
{
    ICollection<string> tags = classifier.Labels();
    string background = classifier.BackgroundSymbol();
    StringBuilder alternation = new StringBuilder();
    foreach (string tag in tags)
    {
        if (background.Equals(tag))
        {
            continue;
        }
        if (alternation.Length > 0)
        {
            alternation.Append('|');
        }
        alternation.Append(tag);
    }
    return alternation.ToString();
}