예제 #1
0
 /// <summary>
 /// Runs the classifier over the current contents of the editor pane and
 /// highlights every tagged entity, then enables the "save tagged as" control.
 /// </summary>
 /// <remarks>
 /// Two paths are taken depending on the editor's content type:
 /// for anything other than <c>text/html</c> the inline XML tags produced by the
 /// classifier are stripped one pair at a time and the enclosed range is styled
 /// through the styled document; for <c>text/html</c> each tag pair is rewritten
 /// into a coloured <c>span</c> and the resulting markup is put back into the editor.
 /// In both cases the classifier output is stored in <c>taggedContents</c>.
 /// </remarks>
 private void Extract()
 {
     string contentType = editorPane.GetContentType();
     log.Info("content type: " + contentType);
     if (!contentType.Equals("text/html"))
     {
         DefaultStyledDocument doc = (DefaultStyledDocument)editorPane.GetDocument();
         string text = null;
         try
         {
             text = doc.GetText(0, doc.GetLength());
         }
         catch (Exception e)
         {
             log.Err(e);
         }
         if (text == null)
         {
             // The document text could not be read (already logged above), so there
             // is nothing to classify. Stop here instead of handing null to the classifier.
             return;
         }
         string labeledText = classifier.ClassifyWithInlineXML(text);
         taggedContents = labeledText;
         ICollection <string> tags = classifier.Labels();
         string        background  = classifier.BackgroundSymbol();
         // Build an alternation "A|B|C" of every label except the background symbol.
         StringBuilder tagPattern  = new StringBuilder();
         foreach (string tag in tags)
         {
             if (background.Equals(tag))
             {
                 continue;
             }
             if (tagPattern.Length > 0)
             {
                 tagPattern.Append('|');
             }
             tagPattern.Append(tag);
         }
         Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
         Pattern endPattern   = Pattern.Compile("</(" + tagPattern + ")>");
         string  finalText    = labeledText;
         Matcher m            = startPattern.Matcher(finalText);
         while (m.Find())
         {
             // Remove the first remaining start tag; its offset is where the entity begins
             // once the tag is gone.
             int start = m.Start();
             finalText = m.ReplaceFirst(string.Empty);
             m         = endPattern.Matcher(finalText);
             if (m.Find())
             {
                 // Same for the first remaining end tag: its offset is the entity's end.
                 int    end   = m.Start();
                 string tag_1 = m.Group(1);
                 finalText = m.ReplaceFirst(string.Empty);
                 IAttributeSet attSet = GetAttributeSet(tag_1);
                 try
                 {
                     string entity = Sharpen.Runtime.Substring(finalText, start, end);
                     doc.SetCharacterAttributes(start, entity.Length, attSet, false);
                 }
                 catch (Exception ex)
                 {
                     // NOTE(review): terminating the whole process on a styling failure is
                     // drastic; kept because it is the existing behavior.
                     log.Err(ex);
                     System.Environment.Exit(-1);
                 }
                 log.Info(tag_1 + ": " + Sharpen.Runtime.Substring(finalText, start, end));
             }
             else
             {
                 log.Info("Couldn't find end pattern!");
             }
             m = startPattern.Matcher(finalText);
         }
         editorPane.Revalidate();
         editorPane.Repaint();
     }
     else
     {
         string untaggedContents = editorPane.GetText();
         if (untaggedContents == null)
         {
             untaggedContents = string.Empty;
         }
         taggedContents = classifier.ClassifyWithInlineXML(untaggedContents);
         ICollection <string> tags = classifier.Labels();
         string        background  = classifier.BackgroundSymbol();
         // Build an alternation "A|B|C" of every label except the background symbol.
         StringBuilder tagPattern  = new StringBuilder();
         foreach (string tag in tags)
         {
             if (background.Equals(tag))
             {
                 continue;
             }
             if (tagPattern.Length > 0)
             {
                 tagPattern.Append('|');
             }
             tagPattern.Append(tag);
         }
         Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
         Pattern endPattern   = Pattern.Compile("</(" + tagPattern + ")>");
         string  finalText    = taggedContents;
         // Offset from which the next start tag is searched. Tags that have no colour
         // are left in the text, so the search has to resume after them; otherwise a
         // later replacement would hit the skipped tag instead of the current one.
         int     searchFrom   = 0;
         Matcher m            = startPattern.Matcher(finalText);
         while (m.Find(searchFrom))
         {
             string tag_1    = m.Group(1);
             int    tagStart = m.Start();
             int    tagEnd   = m.End();
             Color  col      = tagToColorMap[tag_1];
             if (col == null)
             {
                 // No colour configured for this tag: leave it untouched and move on.
                 searchFrom = tagEnd;
                 continue;
             }
             string color  = ColorToHTML(col);
             string newTag = "<span style=\"background-color: " + color + "; color: white\">";
             // Splice the span over exactly the tag that was matched.
             finalText = Sharpen.Runtime.Substring(finalText, 0, tagStart) + newTag + Sharpen.Runtime.Substring(finalText, tagEnd, finalText.Length);
             int     start = tagStart + newTag.Length;
             Matcher m1    = endPattern.Matcher(finalText);
             if (m1.Find(start))
             {
                 int    closeStart = m1.Start();
                 int    closeEnd   = m1.End();
                 string entity     = Sharpen.Runtime.Substring(finalText, start, closeStart);
                 log.Info(tag_1 + ": " + entity);
                 // Close the span at the end tag that follows this entity.
                 finalText = Sharpen.Runtime.Substring(finalText, 0, closeStart) + "</span>" + Sharpen.Runtime.Substring(finalText, closeEnd, finalText.Length);
             }
             else
             {
                 log.Warn("Failed to find end for " + tag_1);
             }
             // The text changed, so a fresh matcher is needed; continue after the new span tag.
             searchFrom = start;
             m          = startPattern.Matcher(finalText);
         }
         editorPane.SetText(finalText);
         editorPane.Revalidate();
         editorPane.Repaint();
     }
     saveTaggedAs.SetEnabled(true);
 }
예제 #2
0
 /// <summary>
 /// Runs the classifier over the current contents of the editor pane and
 /// highlights every tagged entity, then enables the "save tagged as" control.
 /// </summary>
 /// <remarks>
 /// For content types other than <c>text/html</c> the classifier's inline XML tags
 /// are stripped pair by pair and the enclosed range is styled through the styled
 /// document. For <c>text/html</c> each tag pair is rewritten into a coloured
 /// <c>span</c> and the markup is written back to the editor. Both paths record the
 /// raw text in <c>untaggedContents</c> and the classifier output in <c>taggedContents</c>.
 /// </remarks>
 /// <exception cref="System.Exception">
 /// Thrown when the document text cannot be read or the character attributes
 /// cannot be applied; the original failure is available as the inner exception.
 /// </exception>
 private void Extract()
 {
     log.Info("content type: " + editorPane.GetContentType());
     if (!editorPane.GetContentType().Equals("text/html"))
     {
         DefaultStyledDocument doc = (DefaultStyledDocument)editorPane.GetDocument();
         string text = null;
         try
         {
             text = doc.GetText(0, doc.GetLength());
         }
         catch (Exception e)
         {
             // System.Exception has no (Exception) constructor; wrap with message + cause.
             throw new Exception(e.Message, e);
         }
         string labeledText = classifier.ClassifyWithInlineXML(text);
         taggedContents   = labeledText;
         untaggedContents = text;
         ICollection <string> tags = classifier.Labels();
         string background         = classifier.BackgroundSymbol();
         // Build an alternation "A|B|C" of every label except the background symbol.
         string tagPattern         = string.Empty;
         foreach (string tag in tags)
         {
             if (background.Equals(tag))
             {
                 continue;
             }
             if (tagPattern.Length > 0)
             {
                 tagPattern += "|";
             }
             tagPattern += tag;
         }
         Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
         Pattern endPattern   = Pattern.Compile("</(" + tagPattern + ")>");
         string  finalText    = labeledText;
         Matcher m            = startPattern.Matcher(finalText);
         while (m.Find())
         {
             // Remove the first remaining start tag; its offset is where the entity begins
             // once the tag is gone.
             int start = m.Start();
             finalText = m.ReplaceFirst(string.Empty);
             m         = endPattern.Matcher(finalText);
             if (m.Find())
             {
                 // Same for the first remaining end tag: its offset is the entity's end.
                 int    end   = m.Start();
                 string tag_1 = m.Group(1);
                 finalText = m.ReplaceFirst(string.Empty);
                 IAttributeSet attSet = GetAttributeSet(tag_1);
                 try
                 {
                     string entity = Sharpen.Runtime.Substring(finalText, start, end);
                     doc.SetCharacterAttributes(start, entity.Length, attSet, false);
                 }
                 catch (Exception ex)
                 {
                     throw new Exception(ex.Message, ex);
                 }
                 log.Info(tag_1 + ": " + Sharpen.Runtime.Substring(finalText, start, end));
             }
             else
             {
                 // A start tag without a matching end tag: report it rather than skip silently.
                 log.Info("Couldn't find end pattern!");
             }
             m = startPattern.Matcher(finalText);
         }
         editorPane.Revalidate();
         editorPane.Repaint();
     }
     else
     {
         untaggedContents = editorPane.GetText();
         // Classify an empty string when the editor yields no text, so the classifier
         // is never handed null; the field keeps whatever the editor returned.
         string textToClassify = untaggedContents;
         if (textToClassify == null)
         {
             textToClassify = string.Empty;
         }
         taggedContents   = classifier.ClassifyWithInlineXML(textToClassify);
         ICollection <string> tags = classifier.Labels();
         string background         = classifier.BackgroundSymbol();
         // Build an alternation "A|B|C" of every label except the background symbol.
         string tagPattern         = string.Empty;
         foreach (string tag in tags)
         {
             if (background.Equals(tag))
             {
                 continue;
             }
             if (tagPattern.Length > 0)
             {
                 tagPattern += "|";
             }
             tagPattern += tag;
         }
         Pattern startPattern = Pattern.Compile("<(" + tagPattern + ")>");
         Pattern endPattern   = Pattern.Compile("</(" + tagPattern + ")>");
         string  finalText    = taggedContents;
         Matcher m            = startPattern.Matcher(finalText);
         while (m.Find())
         {
             string tag_1  = m.Group(1);
             // NOTE(review): assumes every non-background label has an entry in
             // tagToColorMap - confirm against where the map is populated.
             string color  = ColorToHTML(tagToColorMap[tag_1]);
             string newTag = "<span style=\"background-color: " + color + "; color: white\">";
             finalText = m.ReplaceFirst(newTag);
             int     start = m.Start() + newTag.Length;
             Matcher m1    = endPattern.Matcher(finalText);
             if (m1.Find(m.End()))
             {
                 string entity = Sharpen.Runtime.Substring(finalText, start, m1.Start());
                 log.Info(tag_1 + ": " + entity);
             }
             else
             {
                 // Without this guard m1.Start() would be queried on a matcher with no match.
                 log.Info("Failed to find end for " + tag_1);
             }
             finalText = m1.ReplaceFirst("</span>");
             m         = startPattern.Matcher(finalText);
         }
         System.Console.Out.WriteLine(finalText);
         editorPane.SetText(finalText);
         editorPane.Revalidate();
         editorPane.Repaint();
         log.Info(finalText);
     }
     saveTaggedAs.SetEnabled(true);
 }
예제 #3
0
        /// <summary>
        /// Demo entry point: loads a serialized classifier and prints its output in
        /// several formats, either for a file or for two hard-coded example sentences.
        /// </summary>
        /// <param name="args">
        /// Optional arguments. <c>args[0]</c>, when present, overrides the path of the
        /// serialized classifier. <c>args[1]</c>, when present, names a file to annotate;
        /// without it the hard-coded examples are used.
        /// </param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            string serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";

            if (args.Length > 0)
            {
                serializedClassifier = args[0];
            }
            AbstractSequenceClassifier <CoreLabel> classifier = CRFClassifier.GetClassifier(serializedClassifier);

            /* For either a file to annotate or for the hardcoded text example, this
             * demo file shows several ways to process the input, for teaching purposes.
             */
            if (args.Length > 1)
            {
                /* For the file, it shows (1) how to run NER on a String, (2) how
                 * to get the entities in the String with character offsets, and
                 * (3) how to run NER on a whole file (without loading it into a String).
                 */
                string fileContents             = IOUtils.SlurpFile(args[1]);
                IList <IList <CoreLabel> > @out = classifier.Classify(fileContents);
                foreach (IList <CoreLabel> sentence in @out)
                {
                    foreach (CoreLabel word in sentence)
                    {
                        System.Console.Out.Write(word.Word() + '/' + word.Get(typeof(CoreAnnotations.AnswerAnnotation)) + ' ');
                    }
                    System.Console.Out.WriteLine();
                }
                System.Console.Out.WriteLine("---");
                @out = classifier.ClassifyFile(args[1]);
                foreach (IList <CoreLabel> sentence_1 in @out)
                {
                    foreach (CoreLabel word in sentence_1)
                    {
                        System.Console.Out.Write(word.Word() + '/' + word.Get(typeof(CoreAnnotations.AnswerAnnotation)) + ' ');
                    }
                    System.Console.Out.WriteLine();
                }
                System.Console.Out.WriteLine("---");
                IList <Triple <string, int, int> > list = classifier.ClassifyToCharacterOffsets(fileContents);
                foreach (Triple <string, int, int> item in list)
                {
                    System.Console.Out.WriteLine(item.First() + ": " + Sharpen.Runtime.Substring(fileContents, item.Second(), item.Third()));
                }
                System.Console.Out.WriteLine("---");
                System.Console.Out.WriteLine("Ten best entity labelings");
                IDocumentReaderAndWriter <CoreLabel> readerAndWriter = classifier.MakePlainTextReaderAndWriter();
                classifier.ClassifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);
                System.Console.Out.WriteLine("---");
                System.Console.Out.WriteLine("Per-token marginalized probabilities");
                classifier.PrintProbs(args[1], readerAndWriter);

                // -- This code prints out the first order (token pair) clique probabilities.
                // -- But that output is a bit overwhelming, so we leave it commented out by default.
                // System.out.println("---");
                // System.out.println("First Order Clique Probabilities");
                // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);
            }
            else
            {
                /* For the hard-coded String, it shows how to run it on a single
                 * sentence, and how to do this and produce several formats, including
                 * slash tags and an inline XML output format. It also shows the full
                 * contents of the {@code CoreLabel}s that are constructed by the
                 * classifier. And it shows getting out the probabilities of different
                 * assignments and an n-best list of classifications with probabilities.
                 */
                string[] example = new string[] { "Good afternoon Rajat Raina, how are you today?", "I go to school at Stanford University, which is located in California." };
                foreach (string str in example)
                {
                    System.Console.Out.WriteLine(classifier.ClassifyToString(str));
                }
                System.Console.Out.WriteLine("---");
                foreach (string str_1 in example)
                {
                    // This one puts in spaces and newlines between tokens, so just print not println.
                    System.Console.Out.Write(classifier.ClassifyToString(str_1, "slashTags", false));
                }
                System.Console.Out.WriteLine("---");
                foreach (string str_2 in example)
                {
                    // This one is best for dealing with the output as a TSV (tab-separated column) file.
                    // The first column gives entities, the second their classes, and the third the remaining text in a document
                    System.Console.Out.Write(classifier.ClassifyToString(str_2, "tabbedEntities", false));
                }
                System.Console.Out.WriteLine("---");
                foreach (string str_3 in example)
                {
                    System.Console.Out.WriteLine(classifier.ClassifyWithInlineXML(str_3));
                }
                System.Console.Out.WriteLine("---");
                foreach (string str_4 in example)
                {
                    System.Console.Out.WriteLine(classifier.ClassifyToString(str_4, "xml", true));
                }
                System.Console.Out.WriteLine("---");
                foreach (string str_5 in example)
                {
                    System.Console.Out.Write(classifier.ClassifyToString(str_5, "tsv", false));
                }
                System.Console.Out.WriteLine("---");
                // This gets out entities with character offsets
                int j = 0;
                foreach (string str_6 in example)
                {
                    j++;
                    IList <Triple <string, int, int> > triples = classifier.ClassifyToCharacterOffsets(str_6);
                    foreach (Triple <string, int, int> trip in triples)
                    {
                        // .NET composite formatting; TextWriter has no printf-style method, and
                        // WriteLine supplies the line terminator that "%n" stood for.
                        System.Console.Out.WriteLine("{0} over character offsets [{1}, {2}) in sentence {3}.", trip.First(), trip.Second(), trip.third, j);
                    }
                }
                System.Console.Out.WriteLine("---");
                // This prints out all the details of what is stored for each token
                int i = 0;
                foreach (string str_7 in example)
                {
                    foreach (IList <CoreLabel> lcl in classifier.Classify(str_7))
                    {
                        foreach (CoreLabel cl in lcl)
                        {
                            System.Console.Out.Write(i++ + ": ");
                            System.Console.Out.WriteLine(cl.ToShorterString());
                        }
                    }
                }
                System.Console.Out.WriteLine("---");
            }
        }