// Minimal demo of driving the Chinese Word Segmenter from code.
// The input file is expected to be UTF-8. The dictionary and model locations
// are passed explicitly through properties, so the relative paths below must
// resolve from the current working directory.
// Original demo author: Christopher Manning.
static void Main()
{
    // Folder holding the segmenter models and dictionaries.
    var modelDir = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\data";

    // Sample input `test.simp.utf8`; it contains the following text (quoted verbatim):
    // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
    // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。
    var inputFile = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\test.simp.utf8";

    var dictionaryPath = modelDir + @"\dict-chris6.ser.gz";
    var classifierPath = modelDir + @"\ctb.gz";

    // Properties used while loading the segmenter.
    var config = new Properties();
    config.setProperty("sighanCorporaDict", modelDir);
    // The remaining entries are required because CTBSegDocumentIteratorFactory reads them.
    config.setProperty("serDictionary", dictionaryPath);
    config.setProperty("testFile", inputFile);
    config.setProperty("inputEncoding", "UTF-8");
    config.setProperty("sighanPostProcessing", "true");

    // Build the classifier, load the CTB model into it, then process the sample file.
    var classifier = new CRFClassifier(config);
    classifier.loadClassifierNoExceptions(classifierPath, config);
    classifier.classifyAndWriteAnswers(inputFile);
}
// Minimal demo of driving the Chinese Word Segmenter from code.
// The input file is expected to be UTF-8. The dictionary and model locations
// are passed explicitly through properties, so the relative paths below must
// resolve from the current working directory.
// Original demo author: Christopher Manning.
static void Main()
{
    // Folder holding the segmenter models and dictionaries.
    var modelDir = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\data";

    // Sample input `test.simp.utf8`; it contains the following text (quoted verbatim):
    // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
    // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。
    var inputFile = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\test.simp.utf8";

    var dictionaryPath = modelDir + @"\dict-chris6.ser.gz";
    var classifierPath = modelDir + @"\ctb.gz";

    // Properties used while loading the segmenter.
    var config = new Properties();
    config.setProperty("sighanCorporaDict", modelDir);
    // The remaining entries are required because CTBSegDocumentIteratorFactory reads them.
    config.setProperty("serDictionary", dictionaryPath);
    config.setProperty("testFile", inputFile);
    config.setProperty("inputEncoding", "UTF-8");
    config.setProperty("sighanPostProcessing", "true");

    // Build the classifier, load the CTB model into it, then process the sample file.
    var classifier = new CRFClassifier(config);
    classifier.loadClassifierNoExceptions(classifierPath, config);
    classifier.classifyAndWriteAnswers(inputFile);
}
// Minimal demo of driving the Chinese Word Segmenter from code: processes the
// sample file, then segments one in-memory sentence and prints the result.
// The input file is expected to be UTF-8; dictionary, normalization table and
// model locations are supplied explicitly through properties.
// Original demo author: Christopher Manning.
public void ChineseWordSegmenter()
{
    var inputFile = Files.Segmenter.Data("../test.simp.utf8");

    // Properties used while loading the segmenter.
    var config = new Properties();
    config.setProperty("sighanCorporaDict", Files.Segmenter.Root);
    config.setProperty("NormalizationTable", Files.Segmenter.Data("norm.simp.utf8"));
    config.setProperty("normTableEncoding", "UTF-8");
    // The remaining entries are required because CTBSegDocumentIteratorFactory reads them.
    config.setProperty("serDictionary", Files.Segmenter.Data("dict-chris6.ser.gz"));
    config.setProperty("testFile", inputFile);
    config.setProperty("inputEncoding", "UTF-8");
    config.setProperty("sighanPostProcessing", "true");

    // Build the classifier, load the CTB model into it, then process the sample file.
    var classifier = new CRFClassifier(config);
    var classifierPath = Files.Segmenter.Data(@"ctb.gz");
    classifier.loadClassifierNoExceptions(classifierPath, config);
    classifier.classifyAndWriteAnswers(inputFile);

    // Segment a single sentence held in memory and print what comes back.
    Console.WriteLine(classifier.segmentString("2008年我住在美国。"));
}
// Command-line demo: segments the file named by the single argument using the
// Chinese Word Segmenter. Prints a usage line and exits when the argument
// count is anything other than one.
static void Main(string[] args)
{
    // Locations inside the 2013-06-20 segmenter distribution, relative to the
    // current working directory.
    const string CorporaDictDir = @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data";
    const string SerDictionaryPath = @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\dict-chris6.ser.gz";
    const string ClassifierPath = @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\ctb.gz";

    if (args.Length != 1)
    {
        System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
        return;
    }

    var inputFile = args[0];

    var config = new Properties();
    config.setProperty("sighanCorporaDict", CorporaDictDir);
    // NormalizationTable / normTableEncoding are not configured in this sample.
    // To use a normalization table, point NormalizationTable at the
    // distribution's data\norm.simp.utf8 and set normTableEncoding to "UTF-8".
    // The remaining entries are required because CTBSegDocumentIteratorFactory reads them.
    config.setProperty("serDictionary", SerDictionaryPath);
    config.setProperty("testFile", inputFile);
    config.setProperty("inputEncoding", "UTF-8");
    config.setProperty("sighanPostProcessing", "true");

    // Build the classifier, load the CTB model into it, then process the input file.
    var classifier = new CRFClassifier(config);
    classifier.loadClassifierNoExceptions(ClassifierPath, config);
    classifier.classifyAndWriteAnswers(inputFile);
}
// Command-line demo: segments the file named by the single argument using the
// Chinese Word Segmenter. Prints a usage line and exits when the argument
// count is anything other than one.
static void Main(string[] args)
{
    // Locations inside the 2013-11-12 segmenter distribution, relative to the
    // current working directory.
    const string CorporaDictDir = @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data";
    const string SerDictionaryPath = @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\dict-chris6.ser.gz";
    const string ClassifierPath = @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\ctb.gz";

    if (args.Length != 1)
    {
        System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
        return;
    }

    var inputFile = args[0];

    var config = new Properties();
    config.setProperty("sighanCorporaDict", CorporaDictDir);
    // NormalizationTable / normTableEncoding are not configured in this sample.
    // To use a normalization table, point NormalizationTable at the
    // distribution's data\norm.simp.utf8 and set normTableEncoding to "UTF-8".
    // The remaining entries are required because CTBSegDocumentIteratorFactory reads them.
    config.setProperty("serDictionary", SerDictionaryPath);
    config.setProperty("testFile", inputFile);
    config.setProperty("inputEncoding", "UTF-8");
    config.setProperty("sighanPostProcessing", "true");

    // Build the classifier, load the CTB model into it, then process the input file.
    var classifier = new CRFClassifier(config);
    classifier.loadClassifierNoExceptions(ClassifierPath, config);
    classifier.classifyAndWriteAnswers(inputFile);
}