Пример #1
0
        static void Main()
        {
            // Path to the folder with models
            var segmenterData = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\data";
            var sampleData    = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\test.simp.utf8";

            // `test.simple.utf8` contains following text:
            // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
            // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

            // This is a very simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes an input file in UTF8. This will run correctly in the distribution home
            // directory. To run in general, the properties for where to find dictionaries or
            // normalizations have to be set.
            // @author Christopher Manning

            // Setup Segmenter loading properties
            var props = new Properties();

            props.setProperty("sighanCorporaDict", segmenterData);
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
            props.setProperty("testFile", sampleData);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            // Load Word Segmenter
            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(sampleData);
        }
Пример #2
0
        static void Main()
        {
            // Path to the folder with models
            var segmenterData = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\data";
            var sampleData = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\test.simp.utf8";

            // `test.simple.utf8` contains following text:
            // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
            // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

            // This is a very simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes an input file in UTF8. This will run correctly in the distribution home
            // directory. To run in general, the properties for where to find dictionaries or
            // normalizations have to be set.
            // @author Christopher Manning

            // Setup Segmenter loading properties
            var props = new Properties();
            props.setProperty("sighanCorporaDict", segmenterData);
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
            props.setProperty("testFile", sampleData);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            // Load Word Segmenter
            var segmenter = new CRFClassifier(props);
            segmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(sampleData);
        }
        public void ChineseWordSegmenter()
        {
            var sampleData = Files.Segmenter.Data("../test.simp.utf8");

            // This is a very simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes an input file in UTF8. This will run correctly in the distribution home
            // directory. To run in general, the properties for where to find dictionaries or
            // normalizations have to be set.
            // @author Christopher Manning

            // Setup Segmenter loading properties
            var props = new Properties();

            props.setProperty("sighanCorporaDict", Files.Segmenter.Root);
            props.setProperty("NormalizationTable", Files.Segmenter.Data("norm.simp.utf8"));
            props.setProperty("normTableEncoding", "UTF-8");
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", Files.Segmenter.Data("dict-chris6.ser.gz"));
            props.setProperty("testFile", sampleData);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            // Load Word Segmenter
            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(Files.Segmenter.Data(@"ctb.gz"), props);
            segmenter.classifyAndWriteAnswers(sampleData);

            var sample    = "2008年我住在美国。";
            var segmented = segmenter.segmentString(sample);

            Console.WriteLine(segmented);
        }
Пример #4
0
        static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
                return;
            }

            var props = new Properties();
            props.setProperty("sighanCorporaDict", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data");
            // props.setProperty("NormalizationTable", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\norm.simp.utf8");
            // props.setProperty("normTableEncoding", "UTF-8");
            // below is needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\dict-chris6.ser.gz");
            props.setProperty("testFile", args[0]);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            var segmenter = new CRFClassifier(props);
            segmenter.loadClassifierNoExceptions(@"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(args[0]);
        }
Пример #5
0
        static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
                return;
            }

            var props = new Properties();

            props.setProperty("sighanCorporaDict", @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data");
            // props.setProperty("NormalizationTable", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\norm.simp.utf8");
            // props.setProperty("normTableEncoding", "UTF-8");
            // below is needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\dict-chris6.ser.gz");
            props.setProperty("testFile", args[0]);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(@"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(args[0]);
        }