/// <summary>
/// Minimal demo of driving the Chinese Word Segmenter programmatically
/// (based on the original demo by Christopher Manning).
/// </summary>
/// <remarks>
/// The input file is assumed to be UTF-8. Outside of the distribution home directory the
/// properties that locate the dictionaries/normalizations have to be set explicitly, which
/// is what this sample does.
/// </remarks>
static void Main()
{
    // Folder holding the segmenter models and dictionaries.
    var modelDir = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\data";

    // Sample input `test.simp.utf8`: a short simplified-Chinese passage, roughly
    // "Facing the new century, the shared wish of people of all countries is to keep
    // developing the achievements of civilization, overcome the war and poverty that
    // troubled the 20th century, advance peace and development, and create a better world."
    var inputFile = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\test.simp.utf8";

    var dictionaryFile = modelDir + @"\dict-chris6.ser.gz";
    var classifierFile = modelDir + @"\ctb.gz";

    // Segmenter loading properties.
    var settings = new Properties();
    settings.setProperty("sighanCorporaDict", modelDir);
    settings.setProperty("sighanPostProcessing", "true");
    settings.setProperty("inputEncoding", "UTF-8");

    // These two are needed because CTBSegDocumentIteratorFactory accesses them.
    settings.setProperty("serDictionary", dictionaryFile);
    settings.setProperty("testFile", inputFile);

    // Load the word segmenter model and run it over the sample file.
    var classifier = new CRFClassifier(settings);
    classifier.loadClassifierNoExceptions(classifierFile, settings);
    classifier.classifyAndWriteAnswers(inputFile);
}
/// <summary>
/// Runs the Chinese Word Segmenter over the sample file shipped with the 2015-12-09
/// distribution (based on the original demo by Christopher Manning).
/// </summary>
/// <remarks>
/// The input is assumed to be UTF-8. When not running from the distribution home
/// directory, the dictionary/normalization locations must be supplied as properties.
/// </remarks>
static void Main()
{
    // Sample input `test.simp.utf8`: a short simplified-Chinese passage, roughly
    // "Facing the new century, the shared wish of people of all countries is to keep
    // developing the achievements of civilization, overcome the war and poverty that
    // troubled the 20th century, advance peace and development, and create a better world."
    var sampleFile = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\test.simp.utf8";

    // Folder with the models.
    var dataFolder = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\data";

    // Properties used while loading the segmenter.
    var loadProps = new Properties();
    loadProps.setProperty("inputEncoding", "UTF-8");
    loadProps.setProperty("sighanPostProcessing", "true");
    loadProps.setProperty("sighanCorporaDict", dataFolder);

    // Required because CTBSegDocumentIteratorFactory accesses these entries.
    loadProps.setProperty("serDictionary", dataFolder + @"\dict-chris6.ser.gz");
    loadProps.setProperty("testFile", sampleFile);

    // Load the model, then segment the sample file.
    var wordSegmenter = new CRFClassifier(loadProps);
    var modelFile = dataFolder + @"\ctb.gz";
    wordSegmenter.loadClassifierNoExceptions(modelFile, loadProps);
    wordSegmenter.classifyAndWriteAnswers(sampleFile);
}
/// <summary>
/// Exercises the Chinese Word Segmenter twice: once over the sample file and once over
/// an in-memory sentence whose segmentation is printed to the console.
/// </summary>
/// <remarks>
/// Based on the original demo by Christopher Manning. The input file is assumed to be
/// UTF-8; dictionary and normalization locations are passed explicitly as properties.
/// </remarks>
public void ChineseWordSegmenter()
{
    var inputFile = Files.Segmenter.Data("../test.simp.utf8");

    // Segmenter loading properties.
    var settings = new Properties();
    settings.setProperty("sighanCorporaDict", Files.Segmenter.Root);

    // Normalization table and its encoding.
    var normalizationTable = Files.Segmenter.Data("norm.simp.utf8");
    settings.setProperty("NormalizationTable", normalizationTable);
    settings.setProperty("normTableEncoding", "UTF-8");

    // The entries below are needed because CTBSegDocumentIteratorFactory accesses them.
    var dictionaryFile = Files.Segmenter.Data("dict-chris6.ser.gz");
    settings.setProperty("serDictionary", dictionaryFile);
    settings.setProperty("testFile", inputFile);
    settings.setProperty("inputEncoding", "UTF-8");
    settings.setProperty("sighanPostProcessing", "true");

    // Load the word segmenter model.
    var classifier = new CRFClassifier(settings);
    var modelFile = Files.Segmenter.Data(@"ctb.gz");
    classifier.loadClassifierNoExceptions(modelFile, settings);

    // File-based run.
    classifier.classifyAndWriteAnswers(inputFile);

    // String-based run: segment one sentence and print the result.
    Console.WriteLine(classifier.segmentString("2008年我住在美国。"));
}
/// <summary>
/// Command-line sample: segments the file named by the single argument using the
/// models of the 2013-06-20 segmenter distribution.
/// </summary>
/// <param name="args">Exactly one element: the path of the file to segment.</param>
static void Main(string[] args)
{
    // Exactly one argument (the input file) is expected; otherwise print usage and quit.
    if (args.Length != 1)
    {
        System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
        return;
    }

    var inputFile = args[0];

    const string dataDir = @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data";
    const string dictionaryFile = @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\dict-chris6.ser.gz";
    const string classifierFile = @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\ctb.gz";

    var settings = new Properties();
    settings.setProperty("sighanCorporaDict", dataDir);

    // Optional and left unset here: "NormalizationTable" (path to data\norm.simp.utf8)
    // together with "normTableEncoding" ("UTF-8").

    // serDictionary is needed because CTBSegDocumentIteratorFactory accesses it.
    settings.setProperty("serDictionary", dictionaryFile);
    settings.setProperty("testFile", inputFile);
    settings.setProperty("inputEncoding", "UTF-8");
    settings.setProperty("sighanPostProcessing", "true");

    var classifier = new CRFClassifier(settings);
    classifier.loadClassifierNoExceptions(classifierFile, settings);
    classifier.classifyAndWriteAnswers(inputFile);
}
/// <summary>
/// Command-line sample: segments the file named by the single argument using the
/// models of the 2013-11-12 segmenter distribution.
/// </summary>
/// <param name="args">Exactly one element: the path of the file to segment.</param>
static void Main(string[] args)
{
    // Anything other than a single file name is a usage error.
    if (args.Length != 1)
    {
        System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
        return;
    }

    var fileToSegment = args[0];

    const string corporaDict = @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data";
    const string serializedDictionary = @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\dict-chris6.ser.gz";
    const string modelFile = @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\ctb.gz";

    var loadProps = new Properties();
    loadProps.setProperty("sighanCorporaDict", corporaDict);

    // Optional and left unset here: "NormalizationTable" (path to a norm.simp.utf8 file)
    // together with "normTableEncoding" ("UTF-8").

    // serDictionary is needed because CTBSegDocumentIteratorFactory accesses it.
    loadProps.setProperty("serDictionary", serializedDictionary);
    loadProps.setProperty("testFile", fileToSegment);
    loadProps.setProperty("inputEncoding", "UTF-8");
    loadProps.setProperty("sighanPostProcessing", "true");

    var wordSegmenter = new CRFClassifier(loadProps);
    wordSegmenter.loadClassifierNoExceptions(modelFile, loadProps);
    wordSegmenter.classifyAndWriteAnswers(fileToSegment);
}
/// <summary>
/// Worker routine: loads the CRF segmenter model (<c>ctb.gz</c>), then tokenizes every
/// <c>*.txt</c> file in the input folder and writes each result, under the same file
/// name, into the output folder.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">
/// <see cref="DoWorkEventArgs.Argument"/> is cast to <c>string[]</c>: element 0 is the
/// input folder, element 1 is the output folder.
/// </param>
private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
{
    // Select the text encoding based on the user's choice. The dropdown is read through
    // Invoke, which does not return until the delegate has run.
    Encoding selectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        selectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    // Report what we're working on.
    FilenameLabel.Invoke((MethodInvoker)delegate
    {
        FilenameLabel.Text = "Loading model... please wait...";
    });

    // Path to the folder with models: "data" under the application's base directory.
    var segmenterData = Path.Combine(Path.GetDirectoryName(AppDomain.CurrentDomain.BaseDirectory), @"data");

    var props = new Properties();
    props.setProperty("sighanCorporaDict", segmenterData);
    props.setProperty("serDictionary", Path.Combine(segmenterData, "dict-chris6.ser.gz"));
    // Lines below are needed because CTBSegDocumentIteratorFactory accesses them.
    // Encoding.ToString() yields the .NET type name (e.g. "System.Text.UTF8Encoding"),
    // which is not a charset name; WebName is the IANA name (e.g. "utf-8").
    props.setProperty("inputEncoding", selectedEncoding.WebName);
    props.setProperty("sighanPostProcessing", "true");

    var segmenter = new CRFClassifier(props);
    segmenter.loadClassifierNoExceptions(Path.Combine(segmenterData, "ctb.gz"), props);

    // Worker arguments: [0] = input folder, [1] = output folder.
    var folders = (string[])e.Argument;
    string inputDir = folders[0];
    string outputDir = folders[1];

    // Scan only the top folder unless the user asked for subfolders as well.
    var searchDepth = ScanSubfolderCheckbox.Checked
        ? SearchOption.AllDirectories
        : SearchOption.TopDirectoryOnly;

    var pendingFiles = Directory.EnumerateFiles(inputDir, "*.txt", searchDepth);

    try
    {
        Directory.CreateDirectory(outputDir);

        // Snapshot the file list before anything is written: the enumeration is lazy, so
        // output files created inside the scanned tree could otherwise be picked up as
        // inputs while the loop is still running.
        var inputFiles = new System.Collections.Generic.List<string>(pendingFiles);

        foreach (string inputPath in inputFiles)
        {
            // NOTE(review): only the bare file name is kept, so equally named files from
            // different subfolders end up writing to the same output file.
            string cleanName = Path.GetFileName(inputPath);

            // Report what we're working on.
            FilenameLabel.Invoke((MethodInvoker)delegate
            {
                FilenameLabel.Text = "Analyzing: " + cleanName;
            });

            // Read (lower-cased), tokenize, and write the result with the same encoding.
            string readText = File.ReadAllText(inputPath, selectedEncoding).ToLower();
            string tokenResults = segmenter.classifyToString(readText);

            using (var fileout = new StreamWriter(Path.Combine(outputDir, cleanName), false, selectedEncoding))
            {
                fileout.Write(tokenResults);
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the original user-facing message, but no longer discard the actual cause.
        MessageBox.Show("ZhToken encountered a problem while trying to tokenize/write a file."
            + Environment.NewLine + ex.Message);
    }
}