Esempio n. 1
0
        static void Main()
        {
            // Path to the folder with models
            var segmenterData = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\data";
            var sampleData    = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-segmenter-2018-02-27\test.simp.utf8";

            // `test.simple.utf8` contains following text:
            // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
            // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

            // This is a very simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes an input file in UTF8. This will run correctly in the distribution home
            // directory. To run in general, the properties for where to find dictionaries or
            // normalizations have to be set.
            // @author Christopher Manning

            // Setup Segmenter loading properties
            var props = new Properties();

            props.setProperty("sighanCorporaDict", segmenterData);
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
            props.setProperty("testFile", sampleData);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            // Load Word Segmenter
            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(sampleData);
        }
Esempio n. 2
0
        static void Main()
        {
            // Path to the folder with models
            var segmenterData = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\data";
            var sampleData = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-segmenter-2015-12-09\test.simp.utf8";

            // `test.simple.utf8` contains following text:
            // 面对新世纪,世界各国人民的共同愿望是:继续发展人类以往创造的一切文明成果,克服20世纪困扰着人类的战争和贫
            // 困问题,推进和平与发展的崇高事业,创造一个美好的世界。

            // This is a very simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes an input file in UTF8. This will run correctly in the distribution home
            // directory. To run in general, the properties for where to find dictionaries or
            // normalizations have to be set.
            // @author Christopher Manning

            // Setup Segmenter loading properties
            var props = new Properties();
            props.setProperty("sighanCorporaDict", segmenterData);
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
            props.setProperty("testFile", sampleData);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            // Load Word Segmenter
            var segmenter = new CRFClassifier(props);
            segmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(sampleData);
        }
        public void ChineseWordSegmenter()
        {
            var sampleData = Files.Segmenter.Data("../test.simp.utf8");

            // This is a very simple demo of calling the Chinese Word Segmenter programmatically.
            // It assumes an input file in UTF8. This will run correctly in the distribution home
            // directory. To run in general, the properties for where to find dictionaries or
            // normalizations have to be set.
            // @author Christopher Manning

            // Setup Segmenter loading properties
            var props = new Properties();

            props.setProperty("sighanCorporaDict", Files.Segmenter.Root);
            props.setProperty("NormalizationTable", Files.Segmenter.Data("norm.simp.utf8"));
            props.setProperty("normTableEncoding", "UTF-8");
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", Files.Segmenter.Data("dict-chris6.ser.gz"));
            props.setProperty("testFile", sampleData);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            // Load Word Segmenter
            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(Files.Segmenter.Data(@"ctb.gz"), props);
            segmenter.classifyAndWriteAnswers(sampleData);

            var sample    = "2008年我住在美国。";
            var segmented = segmenter.segmentString(sample);

            Console.WriteLine(segmented);
        }
Esempio n. 4
0
        static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
                return;
            }

            var props = new Properties();
            props.setProperty("sighanCorporaDict", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data");
            // props.setProperty("NormalizationTable", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\norm.simp.utf8");
            // props.setProperty("normTableEncoding", "UTF-8");
            // below is needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\dict-chris6.ser.gz");
            props.setProperty("testFile", args[0]);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            var segmenter = new CRFClassifier(props);
            segmenter.loadClassifierNoExceptions(@"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(args[0]);
        }
Esempio n. 5
0
        static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.WriteLine("usage: StanfordSegmenter.Csharp.Samples.exe filename");
                return;
            }

            var props = new Properties();

            props.setProperty("sighanCorporaDict", @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data");
            // props.setProperty("NormalizationTable", @"..\..\..\..\temp\stanford-segmenter-2013-06-20\data\norm.simp.utf8");
            // props.setProperty("normTableEncoding", "UTF-8");
            // below is needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("serDictionary", @"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\dict-chris6.ser.gz");
            props.setProperty("testFile", args[0]);
            props.setProperty("inputEncoding", "UTF-8");
            props.setProperty("sighanPostProcessing", "true");

            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(@"..\..\..\..\temp\stanford-segmenter-2013-11-12\data\ctb.gz", props);
            segmenter.classifyAndWriteAnswers(args[0]);
        }
Esempio n. 6
0
        private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });


            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker) delegate
            {
                FilenameLabel.Text = "Loading model... please wait...";
            });

            // Path to the folder with models
            var segmenterData = Path.Combine(Path.GetDirectoryName(AppDomain.CurrentDomain.BaseDirectory), @"data");

            var props = new Properties();

            props.setProperty("sighanCorporaDict", segmenterData);
            props.setProperty("serDictionary", segmenterData + @"\dict-chris6.ser.gz");
            // Lines below are needed because CTBSegDocumentIteratorFactory accesses it
            props.setProperty("inputEncoding", SelectedEncoding.ToString());
            props.setProperty("sighanPostProcessing", "true");

            var segmenter = new CRFClassifier(props);

            segmenter.loadClassifierNoExceptions(segmenterData + @"\ctb.gz", props);


            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(((string[])e.Argument)[0], "*.txt", SearchDepth);



            try {
                string outputdir = Path.Combine(((string[])e.Argument)[1]);

                Directory.CreateDirectory(outputdir);



                foreach (string fileName in files)
                {
                    //set up our variables to report
                    string Filename_Clean = Path.GetFileName(fileName);



                    //report what we're working on
                    FilenameLabel.Invoke((MethodInvoker) delegate
                    {
                        FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                    });



                    //do stuff here
                    string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();

                    string TokenResults = segmenter.classifyToString(readText);


                    using (System.IO.StreamWriter fileout =
                               new StreamWriter(Path.Combine(outputdir, Filename_Clean), false, SelectedEncoding))
                    {
                        fileout.Write(TokenResults);
                    }
                }
            }
            catch
            {
                MessageBox.Show("ZhToken encountered a problem while trying to tokenize/write a file.");
            }
        }