static void Main()
        {
            // Location of the Stanford segmenter models and the UTF-8 sample file.
            var dataDirectory = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\data";
            var inputFile     = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\test.simp.utf8";

            // `test.simp.utf8` contains the following text:
            // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
            // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

            // Simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes UTF-8 input and runs correctly from the distribution home
            // directory; elsewhere, the dictionary/normalization properties must be set.
            // @author Christopher Manning

            // Segmenter configuration.
            var configuration = new Properties();
            configuration.setProperty("sighanCorporaDict", dataDirectory);
            // Needed because CTBSegDocumentIteratorFactory accesses it.
            configuration.setProperty("serDictionary", dataDirectory + @"\dict-chris6.ser.gz");
            configuration.setProperty("testFile", inputFile);
            configuration.setProperty("inputEncoding", "UTF-8");
            configuration.setProperty("sighanPostProcessing", "true");

            // Load the word segmenter model and segment the sample file.
            var wordSegmenter = new CRFClassifier(configuration);
            wordSegmenter.loadClassifierNoExceptions(dataDirectory + @"\ctb.gz", configuration);
            wordSegmenter.classifyAndWriteAnswers(inputFile);
        }
Beispiel #2
0
        public void ExtractNeFromFile()
        {
            // Load the 3-class English NER model and classify a sample document.
            var modelPath = Files.NER.Classifier("english.all.3class.distsim.crf.ser.gz");
            var ner       = CRFClassifier.getClassifierNoExceptions(modelPath);
            var text      = System.IO.File.ReadAllText(Files.DataFile("SampleText.txt"));

            var classified = ner.classify(text).toArray();
            Assert.NotNull(classified);

            // Key under which each token stores its NER answer.
            var answerKey = new CoreAnnotations.AnswerAnnotation().getClass();

            foreach (java.util.List sentenceList in classified)
            {
                var tokens = sentenceList.toArray();
                Assert.NotNull(tokens);

                // Print each token with its predicted tag, one sentence per paragraph.
                foreach (CoreLabel token in tokens)
                {
                    var tag = token.get(answerKey);
                    Assert.NotNull(tag);

                    TestContext.Out.WriteLine($"{token.word()}/{tag}");
                }
                TestContext.Out.WriteLine();
            }
        }
Beispiel #3
0
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            // Make console output UTF-8 so segmented Chinese text prints correctly.
            Runtime.SetOut(new TextWriter(System.Console.Out, true, "utf-8"));

            // Segmenter configuration.
            Properties properties = new Properties();
            properties.SetProperty("sighanCorporaDict", basedir);
            // props.setProperty("NormalizationTable", "data/norm.simp.utf8");
            // props.setProperty("normTableEncoding", "UTF-8");
            // Needed because CTBSegDocumentIteratorFactory accesses it.
            properties.SetProperty("serDictionary", basedir + "/dict-chris6.ser.gz");
            if (args.Length > 0)
            {
                properties.SetProperty("testFile", args[0]);
            }
            properties.SetProperty("inputEncoding", "UTF-8");
            properties.SetProperty("sighanPostProcessing", "true");

            // Load the CRF model and segment every file given on the command line.
            CRFClassifier<CoreLabel> segmenter = new CRFClassifier<CoreLabel>(properties);
            segmenter.LoadClassifierNoExceptions(basedir + "/ctb.gz", properties);
            foreach (string inputFile in args)
            {
                segmenter.ClassifyAndWriteAnswers(inputFile);
            }

            // Also segment a short in-memory sample and print the tokens.
            string sample = "我住在美国。";
            IList<string> segmented = segmenter.SegmentString(sample);
            System.Console.Out.WriteLine(segmented);
        }
Beispiel #4
0
        static void Main()
        {
            // Folder with the 2018 segmenter models and the UTF-8 sample file.
            var modelsDirectory = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\data";
            var inputFile       = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\test.simp.utf8";

            // `test.simp.utf8` contains the following text:
            // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
            // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

            // Simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes UTF-8 input and runs correctly from the distribution home
            // directory; elsewhere, the dictionary/normalization properties must be set.
            // @author Christopher Manning

            // Segmenter configuration.
            var configuration = new Properties();
            configuration.setProperty("sighanCorporaDict", modelsDirectory);
            // Needed because CTBSegDocumentIteratorFactory accesses it.
            configuration.setProperty("serDictionary", modelsDirectory + @"\dict-chris6.ser.gz");
            configuration.setProperty("testFile", inputFile);
            configuration.setProperty("inputEncoding", "UTF-8");
            configuration.setProperty("sighanPostProcessing", "true");

            // Load the word segmenter model and segment the sample file.
            var wordSegmenter = new CRFClassifier(configuration);
            wordSegmenter.loadClassifierNoExceptions(modelsDirectory + @"\ctb.gz", configuration);
            wordSegmenter.classifyAndWriteAnswers(inputFile);
        }
        public string Location()
        {
            // Path to the serialized NER model.
            var source = @"C:\Users\chris\Downloads\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz";

            // NER tag used to identify places such as (Nicosia, Limassol).
            const string location = "LOCATION";

            var classifier = CRFClassifier.getClassifierNoExceptions(source);

            // Original tokens of `answer` and the classifier's word/TAG output,
            // token by token (the two arrays are parallel).
            String[] words_array = answer.Split(' ');
            String[] tagged      = classifier.classifyToString(answer).Split(' ');

            // BUG FIX: the original loop returned on the first iteration regardless
            // of the tag (both if/else branches returned) and never incremented the
            // index, so the LOCATION scan could never run. Scan every token and
            // return the first word tagged as a location.
            int count = Math.Min(words_array.Length, tagged.Length);
            for (int a = 0; a < count; a++)
            {
                if (tagged[a].Contains(location))
                {
                    Console.WriteLine(tagged[a]);
                    return(words_array[a]);
                }
            }

            // No location entity found.
            return(null);
        }
Beispiel #6
0
 /// <summary>Loads the default CRF or CMM classifier, then rebuilds the tag UI.</summary>
 /// <param name="crf">True to load the default CRF classifier; false for CMM.</param>
 public virtual void LoadDefaultClassifier(bool crf)
 {
     string kind = crf ? "CRF" : "CMM";
     try
     {
         if (crf)
         {
             classifier = CRFClassifier.GetDefaultClassifier();
         }
         else
         {
             classifier = CMMClassifier.GetDefaultClassifier();
         }
     }
     catch (Exception e)
     {
         // Report the failure through the UI and bail out without touching the panels.
         DisplayError(kind + " Load Error", "Error loading default " + kind + "\nMessage: " + e.Message);
         return;
     }
     RemoveTags();
     BuildTagPanel();
     BuildExtractButton();
 }
Beispiel #7
0
        /// <summary>
        /// Runs NER over the given text and returns every token that was tagged
        /// as an entity (anything other than "/O"), one token per line.
        /// </summary>
        /// <param name="S">Raw input text.</param>
        /// <returns>Newline-separated "word/TAG" tokens for all non-O tags.</returns>
        public string getNER(string S)
        {
            CRFClassifier Classifier = CRFClassifier.getClassifierNoExceptions(@"C:\english.all.3class.distsim.crf.ser.gz");

            // Strip leading/trailing sentence punctuation, then remove commas so the
            // classifier sees plain whitespace-separated tokens.
            string S3 = S.Trim(new Char[] { ',', '.' });
            string S2 = S3.Replace(@",", "");

            String classify = Classifier.classifyToString(S2);

            // StringBuilder instead of repeated string concatenation (the original
            // rebuilt the result string on every appended token). Dead commented-out
            // code removed.
            var result = new System.Text.StringBuilder();
            foreach (string s in classify.Split(' '))
            {
                if (!s.EndsWith("/O"))
                {
                    result.Append(s).Append("\n");
                }
            }

            return(result.ToString());
        }
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            StringUtils.LogInvocationString(log, args);

            // Build the CRF from command-line properties.
            Properties props = StringUtils.ArgsToProperties(args);
            CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(props);

            // Both -trainFile and -exportFeatures are required; exit otherwise.
            string inputFile = crf.flags.trainFile;
            if (inputFile == null)
            {
                log.Info("Please provide input file using -trainFile");
                System.Environment.Exit(-1);
            }
            string outputFile = crf.flags.exportFeatures;
            if (outputFile == null)
            {
                log.Info("Please provide output file using -exportFeatures");
                System.Environment.Exit(-1);
            }

            // Featurize the training documents and dump the features to the output file.
            var featureExporter = new Edu.Stanford.Nlp.IE.Crf.CRFFeatureExporter<CoreLabel>(crf);
            ICollection<IList<CoreLabel>> docs = crf.MakeObjectBankFromFile(inputFile, crf.MakeReaderAndWriter());
            crf.MakeAnswerArraysAndTagIndex(docs);
            featureExporter.PrintFeatures(outputFile, docs);
        }
        public void ChineseWordSegmenter()
        {
            var inputFile = Files.Segmenter.Data("../test.simp.utf8");

            // Simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes UTF-8 input and runs correctly from the distribution home
            // directory; elsewhere, the dictionary/normalization properties must be set.
            // @author Christopher Manning

            // Segmenter configuration.
            var configuration = new Properties();
            configuration.setProperty("sighanCorporaDict", Files.Segmenter.Root);
            configuration.setProperty("NormalizationTable", Files.Segmenter.Data("norm.simp.utf8"));
            configuration.setProperty("normTableEncoding", "UTF-8");
            // Needed because CTBSegDocumentIteratorFactory accesses it.
            configuration.setProperty("serDictionary", Files.Segmenter.Data("dict-chris6.ser.gz"));
            configuration.setProperty("testFile", inputFile);
            configuration.setProperty("inputEncoding", "UTF-8");
            configuration.setProperty("sighanPostProcessing", "true");

            // Load the word segmenter model and segment the sample file.
            var wordSegmenter = new CRFClassifier(configuration);
            wordSegmenter.loadClassifierNoExceptions(Files.Segmenter.Data(@"ctb.gz"), configuration);
            wordSegmenter.classifyAndWriteAnswers(inputFile);

            // Also segment a short in-memory sample and print the tokens.
            var text   = "2008年我住在美国。";
            var tokens = wordSegmenter.segmentString(text);
            Console.WriteLine(tokens);
        }
        /// <summary>Loads the model from disk.</summary>
        /// <param name="path">The location of model that was saved to disk</param>
        /// <param name="entityClassifier">Extractor type handed to <c>MachineReading.MakeEntityExtractor</c></param>
        /// <param name="preferDefaultGazetteer">If true, ignore the serialized gazetteer location and use <c>DefaultPaths.DefaultNflGazetteer</c></param>
        /// <exception cref="System.InvalidCastException">if model is the wrong format</exception>
        /// <exception cref="System.IO.IOException">
        /// if the model file doesn't exist or is otherwise
        /// unavailable/incomplete
        /// </exception>
        /// <exception cref="System.TypeLoadException">this would probably indicate a serious classpath problem</exception>
        public static Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor Load(string path, Type entityClassifier, bool preferDefaultGazetteer)
        {
            // load the additional arguments from the "<path>.extra" sidecar file
            // try to load the extra file from the CLASSPATH first
            InputStream @is = typeof(Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor).GetClassLoader().GetResourceAsStream(path + ".extra");

            // if not found in the CLASSPATH, load from the file system
            if (@is == null)
            {
                @is = new FileInputStream(path + ".extra");
            }
            // The sidecar stores, in order: gazetteer location (string),
            // annotationsToSkip (collection), useSubTypes (bool), useBIO (bool).
            // NOTE(review): the streams are only closed on the success path; an
            // exception during ReadObject leaks them — consider try/finally.
            ObjectInputStream @in = new ObjectInputStream(@is);
            string            gazetteerLocation = ErasureUtils.UncheckedCast <string>(@in.ReadObject());

            if (preferDefaultGazetteer)
            {
                gazetteerLocation = DefaultPaths.DefaultNflGazetteer;
            }
            ICollection <string> annotationsToSkip = ErasureUtils.UncheckedCast <ICollection <string> >(@in.ReadObject());
            bool useSubTypes = ErasureUtils.UncheckedCast <bool>(@in.ReadObject());
            bool useBIO      = ErasureUtils.UncheckedCast <bool>(@in.ReadObject());

            @in.Close();
            @is.Close();
            Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor extractor = (Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor)MachineReading.MakeEntityExtractor(entityClassifier, gazetteerLocation);
            // load the CRF classifier (this works from any resource, e.g., classpath or file system)
            extractor.classifier = CRFClassifier.GetClassifier(path);
            // copy the extra arguments
            extractor.annotationsToSkip = annotationsToSkip;
            extractor.useSubTypes       = useSubTypes;
            extractor.useBIO            = useBIO;
            return(extractor);
        }
Beispiel #11
0
        static void Main(string[] args)
        {
            var jarRoot  = @"stanford-ner-2016-10-31";
            var modelDir = jarRoot + @"\classifiers";

            // Load the 3-class English NER model.
            var ner = CRFClassifier.getClassifierNoExceptions(
                modelDir + @"\english.all.3class.distsim.crf.ser.gz");

            // Raw texts and the corresponding hand-marked reference texts,
            // matched by position within the two directories.
            var rawFiles      = Directory.GetFiles(@"Texts");
            var expectedFiles = Directory.GetFiles(@"MarkedTexts");

            for (int i = 0; i < rawFiles.Length; ++i)
            {
                using (var rawReader = new StreamReader(rawFiles[i]))
                using (var expectedReader = new StreamReader(expectedFiles[i]))
                {
                    string rawText      = rawReader.ReadToEnd();
                    string expectedText = expectedReader.ReadToEnd();

                    // Print the classifier's inline-XML markup next to the reference.
                    var taggedText = ner.classifyWithInlineXML(rawText);
                    Console.WriteLine($"File Name: {Path.GetFileName(rawFiles[i])}\n");
                    Console.WriteLine($"{taggedText}\n\n");
                    Console.WriteLine($"{expectedText}\n");
                }
            }
        }
Beispiel #12
0
 /// <summary>Returns the cached NER classifier for a language, loading it on first use.</summary>
 /// <param name="lang">Language key used to pick the model file via StanfordEnv.</param>
 /// <returns>The (cached) classifier for <paramref name="lang"/>.</returns>
 public static CRFClassifier GetClassifierByLang(string lang)
 {
     // Single TryGetValue lookup instead of the original ContainsKey + Add +
     // indexer (three hash lookups per call).
     // NOTE(review): not thread-safe if callers hit this concurrently — confirm usage.
     if (!classifiers.TryGetValue(lang, out var classifier))
     {
         classifier = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + StanfordEnv.GetNerLanguageFiles(lang));
         classifiers.Add(lang, classifier);
     }
     return(classifier);
 }
Beispiel #13
0
        /// <summary>Loads a CRF classifier from the given file path.</summary>
        /// <param name="classifierPath">Path to the serialized classifier.</param>
        /// <exception cref="FileNotFoundException">Thrown when the path does not exist.</exception>
        public void LoadClassifier(string classifierPath)
        {
            if (File.Exists(classifierPath))
            {
                this.classifier = CRFClassifier.getClassifier(classifierPath);
                return;
            }

            throw new FileNotFoundException($"Could not find the path `{classifierPath}`");
        }
        /// <summary>Loads the OCR engine and NER classifier, updating the loading UI.</summary>
        private async Task Load3rdParty()
        {
            Loading.Visibility = Visibility.Visible;

            // OCR engine, then the NER classifier (loaded off the UI thread).
            _ocr = new TesseractEngine("./tessdata", "eng", EngineMode.Default);
            _classifier = await Task.Run(
                () => CRFClassifier.getClassifierNoExceptions(@"english.all.3class.distsim.crf.ser.gz"));

            RunButton.Content = "Run";
            RunButton.IsEnabled = true;
            Loading.Visibility = Visibility.Collapsed;
        }
Beispiel #15
0
 /// <summary>
 /// Initializes a new instance of the <see cref="Preprocessor" /> class.
 /// </summary>
 public Preprocessor()
 {
     // Candidate containers for the W-questions plus the tokenized article.
     listLatestTokenizedArticle = new List<Token>();
     listWhoCandidates = new List<Candidate>();
     listWhenCandidates = new List<Candidate>();
     listWhereCandidates = new List<Candidate>();
     listWhatCandidates = new List<List<Token>>();
     listWhyCandidates = new List<List<Token>>();

     // NLP models: NER classifier and POS tagger.
     nerClassifier = CRFClassifier.getClassifierNoExceptions(nerModelPath);
     posTagger = new MaxentTagger(posModelPath);
 }
Beispiel #16
0
 /// <summary>Initializes the candidate lists and loads the NER/POS models.</summary>
 public Preprocessor()
 {
     // Candidate containers for the W-questions plus the tokenized article.
     listLatestTokenizedArticle = new List<Token>();
     listWhoCandidates          = new List<Candidate>();
     listWhenCandidates         = new List<Candidate>();
     listWhereCandidates        = new List<Candidate>();
     listWhatCandidates         = new List<List<Token>>();
     listWhyCandidates          = new List<List<Token>>();

     // NLP models: NER classifier and POS tagger.
     nerClassifier = CRFClassifier.getClassifierNoExceptions(nerModelPath);
     posTagger     = new MaxentTagger(posModelPath);
 }
Beispiel #17
0
        /// <summary>Trains a CRF model from a .prop file and serializes it to disk.</summary>
        /// <param name="propPath">Path to the training properties file.</param>
        /// <param name="modelPath">Destination path for the serialized model.</param>
        static void TrainAndWrite(string propPath, string modelPath)
        {
            Properties trainingProps = StringUtils.propFileToProperties(propPath);
            var classifier = new CRFClassifier(new SeqClassifierFlags(trainingProps));
            classifier.train();
            classifier.serializeClassifier(modelPath);
        }
Beispiel #18
0
 /// <summary>Loads the segmenter classifier from a file.</summary>
 /// <param name="filename">Path to the serialized segmenter model.</param>
 /// <param name="p">Properties passed through to the classifier loader.</param>
 public virtual void LoadSegmenter(string filename, Properties p)
 {
     // Wrap any load failure in a RuntimeIOException naming the offending file.
     try
     {
         classifier = CRFClassifier.GetClassifier(filename, p);
     }
     catch (Exception cause)
     {
         throw new RuntimeIOException("Failed to load segmenter " + filename, cause);
     }
 }
Beispiel #19
0
 /// <summary>Loads the 3-class English NER model from the local classifier folder.</summary>
 public NER()
 {
     try
     {
         string classifierRoot = @"D:\Temp\NER\classifiers";
         Classifier = CRFClassifier.getClassifierNoExceptions(classifierRoot + @"\english.all.3class.distsim.crf.ser.gz");
     }
     catch (Exception error)
     {
         // Best-effort: log the failure to the console and leave Classifier unset.
         Console.WriteLine(error.ToString());
     }
 }
        /*
         *  Model creation, saving, and loading
         */
        /// <summary>Trains the entity classifier on the mentions found in the document.</summary>
        /// <param name="doc">Annotated document whose entity mentions form the training set.</param>
        public virtual void Train(Annotation doc)
        {
            // Convert entity mentions into CoreLabel sequences for CRF training.
            IList <IList <CoreLabel> > trainingSet = AnnotationUtils.EntityMentionsToCoreLabels(doc, annotationsToSkip, useSubTypes, useBIO);

            // dump a file in CoNLL-2003 format
            // saveCoNLLFiles("/tmp/ace/train/", doc, useSubTypes, useBIO);
            this.classifier = CreateClassifier();

            // Nothing to train on: keep the fresh classifier untrained.
            if (trainingSet.Count == 0)
            {
                return;
            }
            this.classifier.Train(Java.Util.Collections.UnmodifiableCollection(trainingSet));
        }
Beispiel #21
0
 /// <summary>Make an Arabic Segmenter.</summary>
 /// <param name="props">
 /// Options for how to tokenize. See the main method of
 /// <see cref="ArabicTokenizer{T}"/>
 /// for details
 /// </param>
 public ArabicSegmenter(Properties props)
 {
     /* Serializable */
     // SEGMENTER OPTIONS (can be set in the Properties object
     // passed to the constructor), read below in order:
     //  - optTokenized: the input has already been tokenized; do not run the Arabic tokenizer
     //  - optTokenizer: tokenizer options string
     //  - optPrefix / optSuffix: strings used to mark segmented prefixes/suffixes
     //  - optThreads: number of decoding threads
     //  - optTedEval: write TedEval files with this prefix
     //  - optFeatureFactory: use a custom feature factory
     //  - optWithDomains: training and evaluation files carry domain labels
     //  - optDomain: training and evaluation text are all in one domain (default: atb)
     //  - optNoRewrites: ignore rewrites (training only; produces a model that
     //    then does no-rewrite segmentation)
     //  - optLocalFeaturesOnly: use the original feature set, which doesn't
     //    contain start-and-end "wrapper" features
     isTokenized      = props.Contains(optTokenized);
     tokenizerOptions = props.GetProperty(optTokenizer, null);
     tedEvalPrefix    = props.GetProperty(optTedEval, null);
     hasDomainLabels  = props.Contains(optWithDomains);
     domain           = props.GetProperty(optDomain, "atb");
     noRewrites       = props.Contains(optNoRewrites);
     tf           = GetTokenizerFactory();
     prefixMarker = props.GetProperty(optPrefix, string.Empty);
     suffixMarker = props.GetProperty(optSuffix, string.Empty);
     // localFeaturesOnly forces the local-only feature factory and is mutually
     // exclusive with a custom feature factory.
     if (props.Contains(optLocalFeaturesOnly))
     {
         if (props.Contains(optFeatureFactory))
         {
             throw new Exception("Cannot use custom feature factory with localFeaturesOnly flag--" + "have your custom feature factory extend ArabicSegmenterFeatureFactory instead of " + "StartAndEndArabicSegmenterFeatureFactory and remove the localFeaturesOnly flag."
                                 );
         }
         props.SetProperty(optFeatureFactory, localOnlyFeatureFactory);
     }
     // Fall back to the default feature factory when none was requested.
     if (!props.Contains(optFeatureFactory))
     {
         props.SetProperty(optFeatureFactory, defaultFeatureFactory);
     }
     // Remove all command-line properties that are specific to ArabicSegmenter,
     // so only CRF-relevant properties reach SeqClassifierFlags below.
     props.Remove(optTokenizer);
     props.Remove(optTokenized);
     props.Remove(optPrefix);
     props.Remove(optSuffix);
     props.Remove(optThreads);
     props.Remove(optTedEval);
     props.Remove(optWithDomains);
     props.Remove(optDomain);
     props.Remove(optNoRewrites);
     props.Remove(optLocalFeaturesOnly);
     flags      = new SeqClassifierFlags(props);
     classifier = new CRFClassifier <CoreLabel>(flags);
 }
Beispiel #22
0
        /// <summary>
        /// Reads every article's TNote text from the database, extracts all
        /// PERSON-tagged names via NER inline-XML markup, and inserts each name
        /// into the AllNames table.
        /// </summary>
        private void Rectangle_MouseLeftButtonDown_2(object sender, MouseButtonEventArgs e)
        {
            var jarRoot = @"D:\stanford-ner-2018-10-16";
            var classifiersDirecrory = jarRoot + @"\classifiers";

            // Loading 3 class classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirecrory + @"\english.all.3class.distsim.crf.ser.gz");

            // NOTE(review): conn/cmd/reader are fields; the reader and commands are
            // never disposed and conn is not closed if an exception occurs — confirm
            // whether wrapping in using blocks is safe here.
            conn.Open();
            cmd    = new OleDbCommand("SELECT * From articles", conn);
            reader = cmd.ExecuteReader();
            while (reader.Read())
            {
                // Split the inline-XML output into tokens; "PERSON" / "/PERSON"
                // become standalone tokens because '<' and '>' are separators.
                var           s1          = reader["TNote"].ToString();
                var           s2          = classifier.classifyWithInlineXML(s1);
                List <string> words       = s2.Split(' ', ',', '<', '>').ToList();
                List <string> person      = new List <string>();
                int           count       = 0;
                int           count1      = 0;
                bool          isTagPerson = false;
                foreach (var word in words)
                {
                    // Closing tag: finish the current name; count1 now indexes the
                    // slot the NEXT opening tag will append into.
                    if (word == "/PERSON")
                    {
                        isTagPerson = false;
                        count1     += 1;
                    }

                    // Inside a PERSON span: accumulate the words of the current name.
                    if (isTagPerson)
                    {
                        person[count1] = person[count1] + word + " ";
                    }
                    // Opening tag: start a new (empty) name entry.
                    if (word == "PERSON")
                    {
                        isTagPerson = true;
                        person.Add("");
                    }


                    // NOTE(review): `count` is incremented but never read — dead variable.
                    count += 1;
                }

                // Insert each completed name. NOTE(review): AddWithValue uses "PN"
                // while the SQL says "@PN"; OleDb parameters appear to be matched
                // positionally, so this works — confirm against the provider docs.
                for (int i = 0; i < count1; i++)
                {
                    OleDbCommand cmd2 = new OleDbCommand("Insert Into AllNames(PName) Values(@PN)", conn);
                    cmd2.Parameters.AddWithValue("PN", person[i]);
                    cmd2.ExecuteNonQuery();
                }
            }
            conn.Close();
        }
 /// <summary>Creates an evaluator over a data set and its featurized form.</summary>
 public CRFClassifierEvaluator(string description, CRFClassifier <IN> classifier, ICollection <IList <IN> > data, IList <Triple <int[][][], int[], double[][][]> > featurizedData)
 {
     // TODO: Use data structure to hold data + features
     this.description = description;
     this.classifier = classifier;
     this.data = data;                     // original object bank
     this.featurizedData = featurizedData; // cache of already-featurized documents
     this.cmd = GetCmd(cmdStr);
     this.saveOutput = true;
 }
        /// <summary>
        /// Builds the annotator from properties prefixed with "&lt;name&gt;.",
        /// loading the CRF segmentation model named by "&lt;name&gt;.model".
        /// </summary>
        /// <param name="name">Property-key prefix for this annotator.</param>
        /// <param name="props">Full property set; only prefixed keys are used.</param>
        public ChineseSegmenterAnnotator(string name, Properties props)
        {
            string model = null;
            // Keep only the properties that apply to this annotator
            Properties modelProps = new Properties();
            string     desiredKey = name + '.';

            foreach (string key in props.StringPropertyNames())
            {
                if (key.StartsWith(desiredKey))
                {
                    // skip past name and the subsequent "."
                    string modelKey = Sharpen.Runtime.Substring(key, desiredKey.Length);
                    if (modelKey.Equals("model"))
                    {
                        model = props.GetProperty(key);
                    }
                    else
                    {
                        modelProps.SetProperty(modelKey, props.GetProperty(key));
                    }
                }
            }
            this.Verbose        = PropertiesUtils.GetBool(props, name + ".verbose", false);
            this.normalizeSpace = PropertiesUtils.GetBool(props, name + ".normalizeSpace", false);
            if (model == null)
            {
                throw new Exception("Expected a property " + name + ".model");
            }
            // don't write very much, because the CRFClassifier already reports loading
            if (Verbose)
            {
                log.Info("Loading Segmentation Model ... ");
            }
            try
            {
                segmenter = CRFClassifier.GetClassifier(model, modelProps);
            }
            catch (Exception)
            {
                // BUG FIX: the original had two identical `catch (Exception e)` clauses,
                // which does not compile (CS0160); the unreachable second one also called
                // a nonexistent Exception(Exception) constructor. Keep the first clause's
                // behavior: rethrow the original exception unchanged.
                throw;
            }
            // If newlines are treated as sentence split, we need to retain them in tokenization for ssplit to make use of them
            // BUG FIX: `bool.ValueOf` is a Java-ism with no C# equivalent member; use bool.Parse.
            tokenizeNewline = (!props.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, "never").Equals("never")) || bool.Parse(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
            // record whether or not sentence splitting on two newlines ; if so, need to remove single newlines
            sentenceSplitOnTwoNewlines = props.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, "never").Equals("two");
        }
Beispiel #25
0
        /// <summary>
        /// Renders the classifier's output for <paramref name="input"/> as HTML:
        /// tagged words are wrapped in colored spans (one color per label), the
        /// untagged text between them is HTML-escaped verbatim, and a legend of
        /// all possible tags is appended.
        /// </summary>
        private static void OutputHighlighting(PrintWriter @out, CRFClassifier <ICoreMap> classifier, string input)
        {
            ICollection <string> labels               = classifier.Labels();
            string background                         = classifier.BackgroundSymbol();
            IList <IList <ICoreMap> >   sentences     = classifier.Classify(input);
            IDictionary <string, Color> tagToColorMap = NERGUI.MakeTagToColorMap(labels, background);
            StringBuilder result                      = new StringBuilder();
            // Tracks how far into `input` we have emitted, so inter-token text
            // (whitespace, punctuation) is copied through unchanged.
            int           lastEndOffset               = 0;

            foreach (IList <ICoreMap> sentence in sentences)
            {
                foreach (ICoreMap word in sentence)
                {
                    int    beginOffset = word.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    int    endOffset   = word.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    string answer      = word.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    // Emit any untagged text between the previous token and this one.
                    if (beginOffset > lastEndOffset)
                    {
                        result.Append(StringEscapeUtils.EscapeHtml4(Sharpen.Runtime.Substring(input, lastEndOffset, beginOffset)));
                    }
                    // Add a color bar for any tagged words
                    if (!background.Equals(answer))
                    {
                        Color color = tagToColorMap[answer];
                        result.Append("<span style=\"color:#ffffff;background:" + NERGUI.ColorToHTML(color) + "\">");
                    }
                    result.Append(StringEscapeUtils.EscapeHtml4(Sharpen.Runtime.Substring(input, beginOffset, endOffset)));
                    // Turn off the color bar
                    if (!background.Equals(answer))
                    {
                        result.Append("</span>");
                    }
                    lastEndOffset = endOffset;
                }
            }
            // Emit any trailing text after the last token.
            if (lastEndOffset < input.Length)
            {
                result.Append(StringEscapeUtils.EscapeHtml4(Sharpen.Runtime.Substring(input, lastEndOffset)));
            }
            // Legend: one colored entry per possible tag.
            result.Append("<br><br>");
            result.Append("Potential tags:");
            foreach (KeyValuePair <string, Color> stringColorEntry in tagToColorMap)
            {
                result.Append("<br>&nbsp;&nbsp;");
                Color color = stringColorEntry.Value;
                result.Append("<span style=\"color:#ffffff;background:" + NERGUI.ColorToHTML(color) + "\">");
                result.Append(StringEscapeUtils.EscapeHtml4(stringColorEntry.Key));
                result.Append("</span>");
            }
            @out.Print(result);
        }
Beispiel #26
0
 /// <summary>Copy constructor.</summary>
 /// <param name="other">Segmenter whose settings are copied into this instance.</param>
 public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
 {
     isTokenized = other.isTokenized;
     tokenizerOptions = other.tokenizerOptions;
     prefixMarker = other.prefixMarker;
     suffixMarker = other.suffixMarker;
     tedEvalPrefix = other.tedEvalPrefix;
     hasDomainLabels = other.hasDomainLabels;
     domain = other.domain;
     noRewrites = other.noRewrites;
     flags = other.flags;
     // ArabicTokenizerFactory is *not* threadsafe, so build a fresh copy.
     tf = GetTokenizerFactory();
     // CRFClassifier is threadsafe, so sharing the reference is fine.
     classifier = other.classifier;
 }
Beispiel #27
0
        /// <summary>Tags the given story text with inline-XML NER markup.</summary>
        /// <param name="story">Text to classify.</param>
        /// <returns>The story with entity tags inserted as inline XML.</returns>
        private static string GetNLPResults(string story)
        {
            // Folder with classifier models, relative to the app base directory.
            string baseDirectory        = AppDomain.CurrentDomain.BaseDirectory;
            string classifiersDirectory = baseDirectory + @"..\DirectSupply.Anonymize.Service\Models\NLP";

            // Loading 3 class classifier model
            CRFClassifier classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

            return(classifier.classifyWithInlineXML(story));
        }
Beispiel #28
0
        static void Main(string[] args)
        {
            var propPath  = @"..\..\train.prop";
            var modelPath = @"..\..\ner-model.ser.gz";

            // Train a model from the .prop file, write it out, then reload it.
            TrainAndWrite(propPath, modelPath);
            var classifier = CRFClassifier.getClassifierNoExceptions(modelPath);

            // Tag a few sample queries with the freshly trained model.
            var samples = new String[] { "apple watch", "samsung mobile phones", " lcd 52 inch tv" };
            foreach (String sample in samples)
            {
                DoTagging(classifier, sample);
            }
        }
Beispiel #29
0
        /// <summary>Trains a CRF model from training data and serializes it to disk.</summary>
        /// <param name="inputPath">Training file path; may be null if set in the .prop file.</param>
        /// <param name="outputPath">Destination path for the serialized model.</param>
        /// <param name="properties">Path to the training .prop file.</param>
        public void createModelFromTrainingData(string inputPath, string outputPath, string properties)
        {
            // Build the training properties, overriding the output (and optionally
            // input) locations from the arguments.
            Properties props = edu.stanford.nlp.util.StringUtils.propFileToProperties(properties);
            props.setProperty("serializeTo", outputPath);
            if (inputPath != null)
            {
                props.setProperty("trainFile", inputPath);
            }

            // Train the CRF and persist it to the requested location.
            var crf = new CRFClassifier(new SeqClassifierFlags(props));
            crf.train();
            crf.serializeClassifier(outputPath);
        }
Beispiel #30
0
        /// <summary>
        /// One-time setup: loads the NER classifier, compiles the LOCATION regex,
        /// configures the artist-page scraper, and loads the three hosted feature
        /// tables from the portal.
        /// </summary>
        private async Task Init()
        {
            // Path to the folder with classifiers models
            // NOTE(review): hard-coded local path — confirm this exists on target machines.
            var jarRoot = @"C:\stanford-ner-2018-10-16";
            var classifiersDirecrory = jarRoot + @"\classifiers";

            // Loading 3 class classifier model
            _classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirecrory + @"\english.all.3class.distsim.crf.ser.gz");

            // Define a regular expression for finding the location element
            _locationRx = new Regex(@"<LOCATION\b[^>]*>(.*?)</LOCATION>",
                                    RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // Define configurations for parsing artist and listener info
            // (values are XPath selectors into the artist page markup).
            var configArtistInfoJson = @"
            {
                'artist': '//h1[contains(@class, \'view-header\')]',
                'about': '//div[contains(@class, \'bio-primary\')]',
                'more': '//div[contains(@class, \'bio-secondary\')]',
                'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]',
                'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]'
            }";

            ConfigSection configArtist = StructuredDataConfig.ParseJsonString(configArtistInfoJson);

            _artistScraping = new StructuredDataExtractor(configArtist);

            // Get the hosted feature layers for editing
            ArcGISPortal portal = await ArcGISPortal.CreateAsync();

            PortalItem hometownLayerItem = await PortalItem.CreateAsync(portal, _hometownLayerId);

            PortalItem otherPointsLayerItem = await PortalItem.CreateAsync(portal, _otherPointsLayerId);

            PortalItem listenerLayerItem = await PortalItem.CreateAsync(portal, _listenerLayerId);

            // Each table wraps layer index 0 of its portal item; load them eagerly
            // so later edits don't pay the load cost.
            _hometownTable    = new ServiceFeatureTable(hometownLayerItem, 0);
            _otherPointsTable = new ServiceFeatureTable(otherPointsLayerItem, 0);
            _listenerTable    = new ServiceFeatureTable(listenerLayerItem, 0);
            await _hometownTable.LoadAsync();

            await _otherPointsTable.LoadAsync();

            await _listenerTable.LoadAsync();
        }
Beispiel #31
0
        /// <summary>
        /// Runs the classifier over <paramref name="sentences"/> and extracts each tagged
        /// entity from the inline-XML output as a (word, tag) pair.
        /// </summary>
        /// <param name="classifier">The Stanford CRF classifier to run.</param>
        /// <param name="sentences">Raw text to classify.</param>
        /// <returns>(word, tag) tuples in order of appearance; untagged text is skipped.</returns>
        public static List <(string, string)> ClassifyWordsWithTypes(this CRFClassifier classifier, string sentences)
        {
            List <(string, string)> results = new List <(string, string)>();

            string xmlResults = classifier.classifyWithInlineXML(sentences);

            // \k<tag> forces the closing tag to match the opening one, so mismatched
            // pairs such as "<PERSON>John</LOCATION>" are no longer silently accepted.
            foreach (Match match in Regex.Matches(xmlResults, @"<(?<tag>[^<>]+)>(?<word>[^<>]*)</\k<tag>>"))
            {
                string tag  = match.Groups["tag"].Value;
                string word = match.Groups["word"].Value;

                // Guard against whitespace-only tag names.
                if (!string.IsNullOrWhiteSpace(tag))
                {
                    results.Add((word, tag));
                }
            }

            return results;
        }
Beispiel #32
0
        /// <summary>
        /// Releases unmanaged and - optionally - managed resources
        /// </summary>
        /// <param name="disposing"><c>true</c> to release both managed and unmanaged resources; <c>false</c> to release only unmanaged resources.</param>
        /// <remarks>
        /// Part of the standard IDisposable pattern: Dispose() calls Dispose(true).
        /// Virtual so that subclasses can extend it to release their own resources.
        /// </remarks>
        public virtual void Dispose(bool disposing)
        {
            // Only the first disposing call releases managed state; later calls are no-ops.
            if (disposing && !this.disposed)
            {
                // Drop the managed classifier reference. There are no unmanaged
                // resources in this base class, so nothing else to release.
                Classifier = null;
            }

            // Record disposal so repeated calls are safe and other members can
            // detect use-after-dispose.
            this.disposed = true;
        }
        static void Main(string[] args)
        {
            // Exactly one argument is expected: the UTF-8 input file to segment.
            if (args.Length != 1)
            {
                System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
                return;
            }

            // All model resources live under the segmenter distribution's data folder.
            var dataDir = @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data";

            var props = new Properties();
            props.setProperty("sighanCorporaDict", dataDir);
            // props.setProperty("NormalizationTable", dataDir + @"\norm.simp.utf8");
            // props.setProperty("normTableEncoding", "UTF-8");
            // "serDictionary" is required because CTBSegDocumentIteratorFactory reads it.
            props.setProperty("serDictionary", dataDir + @"\dict-chris6.ser.gz");
            props.setProperty("testFile", args[0]);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            // Load the CTB segmentation model and write the segmented answers out.
            var segmenter = new CRFClassifier(props);
            segmenter.loadClassifierNoExceptions(dataDir + @"\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(args[0]);
        }