static void Main()
{
    // Folder holding the segmenter models and the UTF-8 sample input file.
    var segmenterData = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\data";
    var sampleData = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\test.simp.utf8";

    // `test.simple.utf8` contains the following text:
    // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
    // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

    // Very simple demo of calling the Chinese Word Segmenter programmatically.
    // It assumes a UTF-8 input file and runs as-is from the distribution home
    // directory; elsewhere, the dictionary/normalization properties must be
    // set explicitly.
    // @author Christopher Manning

    // Configure the segmenter.
    var properties = new Properties();
    properties.setProperty("sighanCorporaDict", segmenterData);
    // Needed because CTBSegDocumentIteratorFactory accesses it.
    properties.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
    properties.setProperty("testFile", sampleData);
    properties.setProperty("inputEncoding", "UTF-8");
    properties.setProperty("sighanPostProcessing", "true");

    // Load the CRF word-segmentation model and segment the sample file.
    var wordSegmenter = new CRFClassifier(properties);
    wordSegmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", properties);
    wordSegmenter.classifyAndWriteAnswers(sampleData);
}
public void ExtractNeFromFile()
{
    // Load the English 3-class NER model and run it over a sample text file,
    // printing each token together with its NER answer.
    var modelPath = Files.NER.Classifier("english.all.3class.distsim.crf.ser.gz");
    var classifier = CRFClassifier.getClassifierNoExceptions(modelPath);
    var text = System.IO.File.ReadAllText(Files.DataFile("SampleText.txt"));

    var sentences = classifier.classify(text).toArray();
    Assert.NotNull(sentences);

    // Annotation key under which each token stores the classifier's answer.
    var answerKey = new CoreAnnotations.AnswerAnnotation().getClass();
    foreach (java.util.List rawSentence in sentences)
    {
        var tokens = rawSentence.toArray();
        Assert.NotNull(tokens);
        foreach (CoreLabel token in tokens)
        {
            var label = token.get(answerKey);
            Assert.NotNull(label);
            TestContext.Out.WriteLine($"{token.word()}/{label}");
        }
        // Blank line between sentences.
        TestContext.Out.WriteLine();
    }
}
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    // Make console output UTF-8 so segmented Chinese text prints correctly.
    Runtime.SetOut(new TextWriter(System.Console.Out, true, "utf-8"));

    Properties properties = new Properties();
    properties.SetProperty("sighanCorporaDict", basedir);
    // props.setProperty("NormalizationTable", "data/norm.simp.utf8");
    // props.setProperty("normTableEncoding", "UTF-8");
    // Needed because CTBSegDocumentIteratorFactory accesses it.
    properties.SetProperty("serDictionary", basedir + "/dict-chris6.ser.gz");
    if (args.Length > 0)
    {
        properties.SetProperty("testFile", args[0]);
    }
    properties.SetProperty("inputEncoding", "UTF-8");
    properties.SetProperty("sighanPostProcessing", "true");

    // Load the CRF segmenter model.
    CRFClassifier<CoreLabel> segmenter = new CRFClassifier<CoreLabel>(properties);
    segmenter.LoadClassifierNoExceptions(basedir + "/ctb.gz", properties);

    // Segment every file named on the command line.
    foreach (string filename in args)
    {
        segmenter.ClassifyAndWriteAnswers(filename);
    }

    // Also demonstrate in-memory segmentation of a single sentence.
    string sample = "我住在美国。";
    IList<string> segmented = segmenter.SegmentString(sample);
    System.Console.Out.WriteLine(segmented);
}
static void Main()
{
    // Folder holding the segmenter models and the UTF-8 sample input file.
    var segmenterData = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\data";
    var sampleData = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\test.simp.utf8";

    // `test.simple.utf8` contains the following text:
    // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
    // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

    // Very simple demo of calling the Chinese Word Segmenter programmatically.
    // It assumes a UTF-8 input file and runs as-is from the distribution home
    // directory; elsewhere, the dictionary/normalization properties must be
    // set explicitly.
    // @author Christopher Manning

    // Configure the segmenter.
    var properties = new Properties();
    properties.setProperty("sighanCorporaDict", segmenterData);
    // Needed because CTBSegDocumentIteratorFactory accesses it.
    properties.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
    properties.setProperty("testFile", sampleData);
    properties.setProperty("inputEncoding", "UTF-8");
    properties.setProperty("sighanPostProcessing", "true");

    // Load the CRF word-segmentation model and segment the sample file.
    var wordSegmenter = new CRFClassifier(properties);
    wordSegmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", properties);
    wordSegmenter.classifyAndWriteAnswers(sampleData);
}
public string Location()
{
    // Where the pre-trained 3-class NER model is stored on disk.
    var source = @"C:\Users\chris\Downloads\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz";
    // Tag the classifier emits for place names such as Nicosia or Limassol.
    const string location = "LOCATION";

    var classifier = CRFClassifier.getClassifierNoExceptions(source);

    String[] wordsArray = answer.Split(' ');
    String[] taggedWords = classifier.classifyToString(answer).Split(' ');

    // BUG FIX: the original while-loop returned on its very first iteration no
    // matter what (both the if- and else-branches returned words_array[a]) and
    // never incremented the index, so word 0 was always returned regardless of
    // its tag. Scan every word instead and return the first one tagged as a
    // LOCATION; return null when the text contains none.
    int limit = Math.Min(wordsArray.Length, taggedWords.Length);
    for (int a = 0; a < limit; a++)
    {
        if (taggedWords[a].Contains(location))
        {
            Console.WriteLine(taggedWords[a]);
            return wordsArray[a];
        }
    }
    return null;
}
public virtual void LoadDefaultClassifier(bool crf)
{
    // Human-readable name of the model family being loaded.
    string kind = crf ? "CRF" : "CMM";
    try
    {
        if (crf)
        {
            classifier = CRFClassifier.GetDefaultClassifier();
        }
        else
        {
            classifier = CMMClassifier.GetDefaultClassifier();
        }
    }
    catch (Exception e)
    {
        // Surface the failure in the UI and bail out without rebuilding panels.
        DisplayError(kind + " Load Error", "Error loading default " + kind + "\nMessage: " + e.Message);
        return;
    }
    // Refresh the UI to reflect the newly loaded classifier.
    RemoveTags();
    BuildTagPanel();
    BuildExtractButton();
}
public string getNER(string S)
{
    // NER-tag the input and return only tokens that received a non-O label,
    // one per line. "/O" marks tokens outside any named entity.
    CRFClassifier Classifier = CRFClassifier.getClassifierNoExceptions(@"C:\english.all.3class.distsim.crf.ser.gz");

    // Strip leading/trailing commas and periods, then remove remaining commas.
    string S3 = S.Trim(new Char[] { ',', '.' });
    string S2 = S3.Replace(@",", "");

    String classify = Classifier.classifyToString(S2);

    string result = "";
    foreach (string token in classify.Split(' '))
    {
        if (token.EndsWith("/O"))
        {
            continue; // not part of a named entity
        }
        result = result + token + "\n";
    }
    return result;
}
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    StringUtils.LogInvocationString(log, args);
    Properties properties = StringUtils.ArgsToProperties(args);
    CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(properties);

    // Both an input and an output file must be supplied via flags; exit
    // with a non-zero status otherwise.
    string inputFile = crf.flags.trainFile;
    if (inputFile == null)
    {
        log.Info("Please provide input file using -trainFile");
        System.Environment.Exit(-1);
    }
    string outputFile = crf.flags.exportFeatures;
    if (outputFile == null)
    {
        log.Info("Please provide output file using -exportFeatures");
        System.Environment.Exit(-1);
    }

    // Featurize the documents and dump the features to the output file.
    Edu.Stanford.Nlp.IE.Crf.CRFFeatureExporter<CoreLabel> featureExporter = new Edu.Stanford.Nlp.IE.Crf.CRFFeatureExporter<CoreLabel>(crf);
    ICollection<IList<CoreLabel>> docs = crf.MakeObjectBankFromFile(inputFile, crf.MakeReaderAndWriter());
    crf.MakeAnswerArraysAndTagIndex(docs);
    featureExporter.PrintFeatures(outputFile, docs);
}
public void ChineseWordSegmenter()
{
    // UTF-8 sample file shipped next to the segmenter data directory.
    var sampleData = Files.Segmenter.Data("../test.simp.utf8");

    // Very simple demo of calling the Chinese Word Segmenter programmatically.
    // It assumes a UTF-8 input file and runs as-is from the distribution home
    // directory; elsewhere, the dictionary/normalization properties must be
    // set explicitly.
    // @author Christopher Manning

    // Configure the segmenter.
    var properties = new Properties();
    properties.setProperty("sighanCorporaDict", Files.Segmenter.Root);
    properties.setProperty("NormalizationTable", Files.Segmenter.Data("norm.simp.utf8"));
    properties.setProperty("normTableEncoding", "UTF-8");
    // Needed because CTBSegDocumentIteratorFactory accesses it.
    properties.setProperty("serDictionary", Files.Segmenter.Data("dict-chris6.ser.gz"));
    properties.setProperty("testFile", sampleData);
    properties.setProperty("inputEncoding", "UTF-8");
    properties.setProperty("sighanPostProcessing", "true");

    // Load the CRF model and segment the sample file.
    var wordSegmenter = new CRFClassifier(properties);
    wordSegmenter.loadClassifierNoExceptions(Files.Segmenter.Data(@"ctb.gz"), properties);
    wordSegmenter.classifyAndWriteAnswers(sampleData);

    // Also demonstrate in-memory segmentation of a single sentence.
    var sample = "2008年我住在美国。";
    var segmented = wordSegmenter.segmentString(sample);
    Console.WriteLine(segmented);
}
/// <summary>Loads the model from disk.</summary>
/// <param name="path">The location of model that was saved to disk</param>
/// <exception cref="System.InvalidCastException">if model is the wrong format</exception>
/// <exception cref="System.IO.IOException">
/// if the model file doesn't exist or is otherwise
/// unavailable/incomplete
/// </exception>
/// <exception cref="System.TypeLoadException">this would probably indicate a serious classpath problem</exception>
public static Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor Load(string path, Type entityClassifier, bool preferDefaultGazetteer)
{
    // load the additional arguments
    // try to load the extra file from the CLASSPATH first
    InputStream @is = typeof(Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor).GetClassLoader().GetResourceAsStream(path + ".extra");
    // if not found in the CLASSPATH, load from the file system
    if (@is == null)
    {
        @is = new FileInputStream(path + ".extra");
    }
    // NOTE: the read order below must match the write order used when the
    // ".extra" file was serialized: gazetteer location, annotations to skip,
    // useSubTypes flag, useBIO flag.
    ObjectInputStream @in = new ObjectInputStream(@is);
    string gazetteerLocation = ErasureUtils.UncheckedCast<string>(@in.ReadObject());
    if (preferDefaultGazetteer)
    {
        // Caller asked to ignore the serialized gazetteer and use the default.
        gazetteerLocation = DefaultPaths.DefaultNflGazetteer;
    }
    ICollection<string> annotationsToSkip = ErasureUtils.UncheckedCast<ICollection<string>>(@in.ReadObject());
    bool useSubTypes = ErasureUtils.UncheckedCast<bool>(@in.ReadObject());
    bool useBIO = ErasureUtils.UncheckedCast<bool>(@in.ReadObject());
    @in.Close();
    @is.Close();
    Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor extractor = (Edu.Stanford.Nlp.IE.Machinereading.BasicEntityExtractor)MachineReading.MakeEntityExtractor(entityClassifier, gazetteerLocation);
    // load the CRF classifier (this works from any resource, e.g., classpath or file system)
    extractor.classifier = CRFClassifier.GetClassifier(path);
    // copy the extra arguments onto the freshly built extractor
    extractor.annotationsToSkip = annotationsToSkip;
    extractor.useSubTypes = useSubTypes;
    extractor.useBIO = useBIO;
    return(extractor);
}
static void Main(string[] args)
{
    // Load the 3-class English NER model from the distribution folder.
    var jarRoot = @"stanford-ner-2016-10-31";
    var classifiersDirecrory = jarRoot + @"\classifiers";
    var classifier = CRFClassifier.getClassifierNoExceptions(
        classifiersDirecrory + @"\english.all.3class.distsim.crf.ser.gz");

    // Pair each raw text with its hand-marked counterpart by index.
    var rawFileNames = Directory.GetFiles(@"Texts");
    var markedFileNames = Directory.GetFiles(@"MarkedTexts");

    for (int i = 0; i < rawFileNames.Length; ++i)
    {
        using (var rawReader = new StreamReader(rawFileNames[i]))
        using (var markedReader = new StreamReader(markedFileNames[i]))
        {
            string rawText = rawReader.ReadToEnd();
            string rightMarkedText = markedReader.ReadToEnd();

            // Classifier output with inline XML tags, printed next to the
            // reference markup for visual comparison.
            var markedText = classifier.classifyWithInlineXML(rawText);
            Console.WriteLine($"File Name: {Path.GetFileName(rawFileNames[i])}\n");
            Console.WriteLine($"{markedText}\n\n");
            Console.WriteLine($"{rightMarkedText}\n");
        }
    }
}
public static CRFClassifier GetClassifierByLang(string lang)
{
    // Lazily load and cache one classifier per language.
    // Uses TryGetValue instead of the original ContainsKey + Add + indexer,
    // which performed three dictionary lookups per call.
    CRFClassifier classifier;
    if (!classifiers.TryGetValue(lang, out classifier))
    {
        classifier = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + StanfordEnv.GetNerLanguageFiles(lang));
        classifiers.Add(lang, classifier);
    }
    return classifier;
    // NOTE(review): not thread-safe; if called concurrently, guard the cache
    // or switch to ConcurrentDictionary — confirm against callers.
}
public void LoadClassifier(string classifierPath)
{
    // Fail fast with a clear message when the model file is missing,
    // rather than letting the underlying loader throw something opaque.
    if (File.Exists(classifierPath))
    {
        this.classifier = CRFClassifier.getClassifier(classifierPath);
        return;
    }
    throw new FileNotFoundException($"Could not find the path `{classifierPath}`");
}
private async Task Load3rdParty()
{
    // Show the loading indicator while third-party engines initialize.
    Loading.Visibility = Visibility.Visible;

    // Tesseract OCR engine (English).
    _ocr = new TesseractEngine("./tessdata", "eng", EngineMode.Default);

    // The NER model load is CPU/IO heavy, so run it off the UI thread.
    _classifier = await Task.Run(() => CRFClassifier.getClassifierNoExceptions(@"english.all.3class.distsim.crf.ser.gz"));

    // Re-enable the UI once everything is ready.
    RunButton.Content = "Run";
    RunButton.IsEnabled = true;
    Loading.Visibility = Visibility.Collapsed;
}
/// <summary>
/// Initializes a new instance of the <see cref="Preprocessor" /> class.
/// </summary>
public Preprocessor()
{
    // Candidate and token buffers start empty.
    listLatestTokenizedArticle = new List<Token>();
    listWhoCandidates = new List<Candidate>();
    listWhenCandidates = new List<Candidate>();
    listWhereCandidates = new List<Candidate>();
    listWhatCandidates = new List<List<Token>>();
    listWhyCandidates = new List<List<Token>>();

    // Load the NER and POS models from their configured paths.
    nerClassifier = CRFClassifier.getClassifierNoExceptions(nerModelPath);
    posTagger = new MaxentTagger(posModelPath);
}
public Preprocessor()
{
    // Initialize empty token and W-question candidate collections.
    listLatestTokenizedArticle = new List<Token>();
    listWhoCandidates = new List<Candidate>();
    listWhenCandidates = new List<Candidate>();
    listWhereCandidates = new List<Candidate>();
    listWhatCandidates = new List<List<Token>>();
    listWhyCandidates = new List<List<Token>>();

    // Load the NER classifier and POS tagger models.
    nerClassifier = CRFClassifier.getClassifierNoExceptions(nerModelPath);
    posTagger = new MaxentTagger(posModelPath);
}
static void TrainAndWrite(string propPath, string modelPath)
{
    // Read training configuration from the .prop file, train a CRF model,
    // and serialize the trained model to modelPath.
    Properties trainingProps = StringUtils.propFileToProperties(propPath);
    var classifierFlags = new SeqClassifierFlags(trainingProps);
    var classifier = new CRFClassifier(classifierFlags);
    classifier.train();
    classifier.serializeClassifier(modelPath);
}
public virtual void LoadSegmenter(string filename, Properties p)
{
    // Load the CRF segmenter model; wrap any failure in a RuntimeIOException
    // so callers see a consistent exception type with the filename attached.
    try
    {
        classifier = CRFClassifier.GetClassifier(filename, p);
    }
    catch (Exception e)
    {
        throw new RuntimeIOException("Failed to load segmenter " + filename, e);
    }
}
public NER()
{
    try
    {
        // Load the 3-class English NER model from the local classifiers folder.
        string root = @"D:\Temp\NER\classifiers";
        Classifier = CRFClassifier.getClassifierNoExceptions(root + @"\english.all.3class.distsim.crf.ser.gz");
    }
    catch (Exception ex)
    {
        // Best-effort: log and continue with a null classifier.
        Console.WriteLine(ex.ToString());
    }
}
/*
 * Model creation, saving, loading, and saving
 */
public virtual void Train(Annotation doc)
{
    // Convert the document's entity mentions into per-token CoreLabel
    // sequences suitable for CRF training.
    IList<IList<CoreLabel>> trainingSet = AnnotationUtils.EntityMentionsToCoreLabels(doc, annotationsToSkip, useSubTypes, useBIO);
    // dump a file in CoNLL-2003 format
    // saveCoNLLFiles("/tmp/ace/train/", doc, useSubTypes, useBIO);
    this.classifier = CreateClassifier();
    // Only train when there is at least one labeled sequence.
    if (trainingSet.Count > 0)
    {
        this.classifier.Train(Java.Util.Collections.UnmodifiableCollection(trainingSet));
    }
}
/// <summary>Make an Arabic Segmenter.</summary>
/// <param name="props">
/// Options for how to tokenize. See the main method of
/// <see cref="ArabicTokenizer{T}"/>
/// for details
/// </param>
public ArabicSegmenter(Properties props)
{
    /* Serializable */
    // SEGMENTER OPTIONS (can be set in the Properties object
    // passed to the constructor).
    // - optTokenized: the input has already been tokenized; skip the Arabic tokenizer.
    // - optTokenizer: tokenizer options string.
    // - optPrefix / optSuffix: markers for segmented prefixes/suffixes.
    // - optThreads: number of decoding threads.
    // - optTedEval: write TedEval files with this prefix.
    // - optFeatureFactory: use a custom feature factory.
    // - optWithDomains: training and evaluation files carry domain labels.
    // - optDomain: single domain for all text (default: atb).
    // - optNoRewrites: ignore rewrites (training only; produces a model usable
    //   for no-rewrite segmentation).
    // - optLocalFeaturesOnly: use the original feature set without the
    //   start-and-end "wrapper" features.
    // Note: boolean flags are detected via props.Contains (presence only);
    // valued options are read with props.GetProperty and a default.
    isTokenized = props.Contains(optTokenized);
    tokenizerOptions = props.GetProperty(optTokenizer, null);
    tedEvalPrefix = props.GetProperty(optTedEval, null);
    hasDomainLabels = props.Contains(optWithDomains);
    domain = props.GetProperty(optDomain, "atb");
    noRewrites = props.Contains(optNoRewrites);
    tf = GetTokenizerFactory();
    prefixMarker = props.GetProperty(optPrefix, string.Empty);
    suffixMarker = props.GetProperty(optSuffix, string.Empty);
    // localFeaturesOnly and a custom feature factory are mutually exclusive:
    // the local-only mode is implemented by forcing a specific factory.
    if (props.Contains(optLocalFeaturesOnly))
    {
        if (props.Contains(optFeatureFactory))
        {
            throw new Exception("Cannot use custom feature factory with localFeaturesOnly flag--" + "have your custom feature factory extend ArabicSegmenterFeatureFactory instead of " + "StartAndEndArabicSegmenterFeatureFactory and remove the localFeaturesOnly flag."
            );
        }
        props.SetProperty(optFeatureFactory, localOnlyFeatureFactory);
    }
    if (!props.Contains(optFeatureFactory))
    {
        props.SetProperty(optFeatureFactory, defaultFeatureFactory);
    }
    // Remove all command-line properties that are specific to ArabicSegmenter
    // so they are not misinterpreted by the generic CRF flag parser below.
    props.Remove(optTokenizer);
    props.Remove(optTokenized);
    props.Remove(optPrefix);
    props.Remove(optSuffix);
    props.Remove(optThreads);
    props.Remove(optTedEval);
    props.Remove(optWithDomains);
    props.Remove(optDomain);
    props.Remove(optNoRewrites);
    props.Remove(optLocalFeaturesOnly);
    // Remaining properties configure the underlying CRF classifier.
    flags = new SeqClassifierFlags(props);
    classifier = new CRFClassifier<CoreLabel>(flags);
}
// Reads every article from the `articles` table, runs NER over the TNote
// column, extracts all PERSON spans from the inline-XML output, and inserts
// each extracted name into the AllNames table.
private void Rectangle_MouseLeftButtonDown_2(object sender, MouseButtonEventArgs e)
{
    var jarRoot = @"D:\stanford-ner-2018-10-16";
    var classifiersDirecrory = jarRoot + @"\classifiers";
    // Loading 3 class classifier model
    var classifier = CRFClassifier.getClassifierNoExceptions(
        classifiersDirecrory + @"\english.all.3class.distsim.crf.ser.gz");
    conn.Open();
    cmd = new OleDbCommand("SELECT * From articles", conn);
    reader = cmd.ExecuteReader();
    while (reader.Read())
    {
        var s1 = reader["TNote"].ToString();
        // Tagged output, e.g. "<PERSON>John Smith</PERSON> went ...".
        var s2 = classifier.classifyWithInlineXML(s1);
        // Splitting on space/comma/angle-brackets turns "<PERSON>" into the
        // token "PERSON" and "</PERSON>" into "/PERSON" (plus empty entries).
        List <string> words = s2.Split(' ', ',', '<', '>').ToList();
        List <string> person = new List <string>();
        int count = 0;
        int count1 = 0;
        bool isTagPerson = false;
        // Simple state machine: "PERSON" opens an entity (append a new empty
        // slot), "/PERSON" closes it (advance count1), and while inside an
        // entity each word is appended to the current slot.
        // NOTE(review): `count` is incremented but never used; correctness of
        // person[count1] relies on the tags always being properly paired —
        // confirm the classifier output guarantees this.
        foreach (var word in words)
        {
            if (word == "/PERSON")
            {
                isTagPerson = false;
                count1 += 1;
            }
            if (isTagPerson)
            {
                person[count1] = person[count1] + word + " ";
            }
            if (word == "PERSON")
            {
                isTagPerson = true;
                person.Add("");
            }
            count += 1;
        }
        // Persist each completed PERSON entity (parameterized insert).
        for (int i = 0; i < count1; i++)
        {
            OleDbCommand cmd2 = new OleDbCommand("Insert Into AllNames(PName) Values(@PN)", conn);
            cmd2.Parameters.AddWithValue("PN", person[i]);
            cmd2.ExecuteNonQuery();
        }
    }
    conn.Close();
}
public CRFClassifierEvaluator(string description, CRFClassifier<IN> classifier, ICollection<IList<IN>> data, IList<Triple<int[][][], int[], double[][][]>> featurizedData)
{
    // TODO: Use data structure to hold data + features
    // Cache already featurized documents
    // Original object bank
    // Featurized data
    this.description = description;
    this.classifier = classifier;
    // `data` is the original object bank; `featurizedData` caches the
    // already-featurized form of the same documents.
    this.data = data;
    this.featurizedData = featurizedData;
    // Build the external evaluation command and default to saving its output.
    cmd = GetCmd(cmdStr);
    saveOutput = true;
}
// Builds a Chinese word-segmentation annotator: collects all properties
// prefixed with "<name>." into a model-specific Properties object, loads the
// CRF segmenter model, and records newline-handling flags for tokenization.
public ChineseSegmenterAnnotator(string name, Properties props)
{
    string model = null;
    // Keep only the properties that apply to this annotator
    Properties modelProps = new Properties();
    string desiredKey = name + '.';
    foreach (string key in props.StringPropertyNames())
    {
        if (key.StartsWith(desiredKey))
        {
            // skip past name and the subsequent "."
            string modelKey = Sharpen.Runtime.Substring(key, desiredKey.Length);
            if (modelKey.Equals("model"))
            {
                // "<name>.model" names the model file itself.
                model = props.GetProperty(key);
            }
            else
            {
                // Everything else is forwarded to the segmenter.
                modelProps.SetProperty(modelKey, props.GetProperty(key));
            }
        }
    }
    this.Verbose = PropertiesUtils.GetBool(props, name + ".verbose", false);
    this.normalizeSpace = PropertiesUtils.GetBool(props, name + ".normalizeSpace", false);
    // The model property is mandatory; fail loudly if absent.
    if (model == null)
    {
        throw new Exception("Expected a property " + name + ".model");
    }
    // don't write very much, because the CRFClassifier already reports loading
    if (Verbose)
    {
        log.Info("Loading Segmentation Model ... ");
    }
    // NOTE(review): the two catch clauses below both catch Exception — this
    // looks like an artifact of automatic Java-to-C# conversion (originally
    // IOException vs Exception) and will not compile as-is; confirm intended
    // exception types before fixing.
    try
    {
        segmenter = CRFClassifier.GetClassifier(model, modelProps);
    }
    catch (Exception e)
    {
        throw;
    }
    catch (Exception e)
    {
        throw new Exception(e);
    }
    // If newlines are treated as sentence split, we need to retain them in tokenization for ssplit to make use of them
    tokenizeNewline = (!props.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, "never").Equals("never")) || bool.ValueOf(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
    // record whether or not sentence splitting on two newlines ; if so, need to remove single newlines
    sentenceSplitOnTwoNewlines = props.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, "never").Equals("two");
}
// Renders the classifier's output for `input` as HTML: tagged words are
// wrapped in colored <span>s, untagged text is HTML-escaped verbatim, and a
// legend of all possible tags is appended. The result is written to @out.
private static void OutputHighlighting(PrintWriter @out, CRFClassifier<ICoreMap> classifier, string input)
{
    ICollection<string> labels = classifier.Labels();
    // The background symbol is the "no entity" label; it gets no highlighting.
    string background = classifier.BackgroundSymbol();
    IList<IList<ICoreMap>> sentences = classifier.Classify(input);
    IDictionary<string, Color> tagToColorMap = NERGUI.MakeTagToColorMap(labels, background);
    StringBuilder result = new StringBuilder();
    // Tracks how far into `input` we have already emitted, so inter-token
    // text (whitespace, punctuation) is copied through unmodified.
    int lastEndOffset = 0;
    foreach (IList<ICoreMap> sentence in sentences)
    {
        foreach (ICoreMap word in sentence)
        {
            int beginOffset = word.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            int endOffset = word.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
            string answer = word.Get(typeof(CoreAnnotations.AnswerAnnotation));
            // Emit any raw text between the previous token and this one.
            if (beginOffset > lastEndOffset)
            {
                result.Append(StringEscapeUtils.EscapeHtml4(Sharpen.Runtime.Substring(input, lastEndOffset, beginOffset)));
            }
            // Add a color bar for any tagged words
            if (!background.Equals(answer))
            {
                Color color = tagToColorMap[answer];
                result.Append("<span style=\"color:#ffffff;background:" + NERGUI.ColorToHTML(color) + "\">");
            }
            result.Append(StringEscapeUtils.EscapeHtml4(Sharpen.Runtime.Substring(input, beginOffset, endOffset)));
            // Turn off the color bar
            if (!background.Equals(answer))
            {
                result.Append("</span>");
            }
            lastEndOffset = endOffset;
        }
    }
    // Emit any trailing text after the final token.
    if (lastEndOffset < input.Length)
    {
        result.Append(StringEscapeUtils.EscapeHtml4(Sharpen.Runtime.Substring(input, lastEndOffset)));
    }
    result.Append("<br><br>");
    // Append a legend mapping each possible tag to its highlight color.
    result.Append("Potential tags:");
    foreach (KeyValuePair<string, Color> stringColorEntry in tagToColorMap)
    {
        result.Append("<br>&nbsp;&nbsp;");
        Color color = stringColorEntry.Value;
        result.Append("<span style=\"color:#ffffff;background:" + NERGUI.ColorToHTML(color) + "\">");
        result.Append(StringEscapeUtils.EscapeHtml4(stringColorEntry.Key));
        result.Append("</span>");
    }
    @out.Print(result);
}
/// <summary>Copy constructor.</summary>
/// <param name="other"/>
public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
{
    // Copy the simple configuration fields verbatim.
    isTokenized = other.isTokenized;
    tokenizerOptions = other.tokenizerOptions;
    tedEvalPrefix = other.tedEvalPrefix;
    prefixMarker = other.prefixMarker;
    suffixMarker = other.suffixMarker;
    hasDomainLabels = other.hasDomainLabels;
    domain = other.domain;
    noRewrites = other.noRewrites;
    flags = other.flags;
    // ArabicTokenizerFactory is *not* threadsafe. Make a new copy.
    tf = GetTokenizerFactory();
    // CRFClassifier is threadsafe, so return a reference.
    classifier = other.classifier;
}
private static string GetNLPResults(string story)
{
    // Resolve the classifier model folder relative to the app base directory.
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    string classifiersDirectory = baseDirectory + @"..\DirectSupply.Anonymize.Service\Models\NLP";

    // Loading 3 class classifier model
    CRFClassifier classifier = CRFClassifier.getClassifierNoExceptions(
        classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

    // Return the story with NER entities marked up as inline XML tags.
    return classifier.classifyWithInlineXML(story);
}
static void Main(string[] args)
{
    // Train a custom NER model from the .prop file, serialize it, then
    // reload it and tag a few sample phrases.
    var propPath = @"..\..\train.prop";
    var modelPath = @"..\..\ner-model.ser.gz";
    TrainAndWrite(propPath, modelPath);

    var crf = CRFClassifier.getClassifierNoExceptions(modelPath);

    String[] tests = new String[]
    {
        "apple watch",
        "samsung mobile phones",
        " lcd 52 inch tv"
    };
    foreach (String item in tests)
    {
        DoTagging(crf, item);
    }
}
public void createModelFromTrainingData(string inputPath, string outputPath, string properties)
{
    // Read the base configuration, then override the serialization target
    // and (optionally) the training-file path.
    Properties trainingProps = edu.stanford.nlp.util.StringUtils.propFileToProperties(properties);
    trainingProps.setProperty("serializeTo", outputPath);
    if (inputPath != null)
    {
        trainingProps.setProperty("trainFile", inputPath);
    }

    // Train the CRF and write the resulting model to outputPath.
    SeqClassifierFlags classifierFlags = new SeqClassifierFlags(trainingProps);
    CRFClassifier model = new CRFClassifier(classifierFlags);
    model.train();
    model.serializeClassifier(outputPath);
}
// One-time setup: loads the NER classifier, compiles the LOCATION-extraction
// regex, configures the HTML scraping extractor, and loads the three hosted
// ArcGIS feature tables used for editing.
private async Task Init()
{
    // Path to the folder with classifiers models
    var jarRoot = @"C:\stanford-ner-2018-10-16";
    var classifiersDirecrory = jarRoot + @"\classifiers";

    // Loading 3 class classifier model
    _classifier = CRFClassifier.getClassifierNoExceptions(
        classifiersDirecrory + @"\english.all.3class.distsim.crf.ser.gz");

    // Define a regular expression for finding the location element
    // (matches the inline-XML <LOCATION>…</LOCATION> tags the classifier emits).
    _locationRx = new Regex(@"<LOCATION\b[^>]*>(.*?)</LOCATION>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Define configurations for parsing artist and listener info
    // (XPath selectors keyed by field name, consumed by StructuredDataConfig).
    var configArtistInfoJson = @"
    {
        'artist': '//h1[contains(@class, \'view-header\')]',
        'about': '//div[contains(@class, \'bio-primary\')]',
        'more': '//div[contains(@class, \'bio-secondary\')]',
        'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]',
        'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]'
    }";
    ConfigSection configArtist = StructuredDataConfig.ParseJsonString(configArtistInfoJson);
    _artistScraping = new StructuredDataExtractor(configArtist);

    // Get the hosted feature layers for editing
    ArcGISPortal portal = await ArcGISPortal.CreateAsync();
    PortalItem hometownLayerItem = await PortalItem.CreateAsync(portal, _hometownLayerId);
    PortalItem otherPointsLayerItem = await PortalItem.CreateAsync(portal, _otherPointsLayerId);
    PortalItem listenerLayerItem = await PortalItem.CreateAsync(portal, _listenerLayerId);

    // Each table is built from layer 0 of its portal item and must be loaded
    // before use.
    _hometownTable = new ServiceFeatureTable(hometownLayerItem, 0);
    _otherPointsTable = new ServiceFeatureTable(otherPointsLayerItem, 0);
    _listenerTable = new ServiceFeatureTable(listenerLayerItem, 0);
    await _hometownTable.LoadAsync();
    await _otherPointsTable.LoadAsync();
    await _listenerTable.LoadAsync();
}
public static List<(string, string)> ClassifyWordsWithTypes(this CRFClassifier classifier, string sentences)
{
    // Run the classifier and pull (word, tag) pairs out of its inline-XML
    // output, e.g. "<PERSON>John</PERSON>" -> ("John", "PERSON").
    string xmlResults = classifier.classifyWithInlineXML(sentences);

    var pairs = new List<(string, string)>();
    foreach (Match match in Regex.Matches(xmlResults, @"<(?<tag>[^<>]*)>(?<word>[^<>]*)<\/[^<>]*>"))
    {
        string entityTag = match.Groups["tag"].Value;
        // Skip matches whose tag is empty or whitespace-only.
        if (string.IsNullOrWhiteSpace(entityTag))
        {
            continue;
        }
        pairs.Add((match.Groups["word"].Value, entityTag));
    }
    return pairs;
}
/// <summary>
/// Releases unmanaged and - optionally - managed resources
/// </summary>
/// <param name="disposing"><c>true</c> to release both managed and unmanaged resources; <c>false</c> to release only unmanaged resources.</param>
/// <remarks>
/// If the main class was marked as sealed, we could just make this a private void Dispose(bool). Alternatively, we could (in this case) put
/// all of our logic directly in Dispose(). Later subclasses will extend the managed-resource section.
/// </remarks>
public virtual void Dispose(bool disposing)
{
    // The disposed flag makes repeated Dispose calls safe, which the
    // IDisposable contract requires. Only the first managed-dispose pass
    // does any work; the classifier reference is simply dropped.
    if (!this.disposed && disposing)
    {
        Classifier = null;
    }
    // Always record disposal so later calls no-op and
    // ObjectDisposedException handling stays consistent.
    this.disposed = true;
}
static void Main(string[] args)
{
    // Exactly one argument is required: the UTF-8 file to segment.
    if (args.Length != 1)
    {
        System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
        return;
    }

    // Configure the Chinese word segmenter.
    var properties = new Properties();
    properties.setProperty("sighanCorporaDict", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data");
    // props.setProperty("NormalizationTable", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\norm.simp.utf8");
    // props.setProperty("normTableEncoding", "UTF-8");
    // Needed because CTBSegDocumentIteratorFactory accesses it.
    properties.setProperty("serDictionary", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\dict-chris6.ser.gz");
    properties.setProperty("testFile", args[0]);
    properties.setProperty("inputEncoding", "UTF-8");
    properties.setProperty("sighanPostProcessing", "true");

    // Load the CRF model and segment the requested file.
    var wordSegmenter = new CRFClassifier(properties);
    wordSegmenter.loadClassifierNoExceptions(@"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\ctb.gz", properties);
    wordSegmenter.classifyAndWriteAnswers(args[0]);
}