Code example #1
        public virtual void TestCharOffsets()
        {
            string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";

            int[] beginOffsets = new int[] { 0, 7, 11, 22 };
            int[] endOffsets   = new int[] { 6, 10, 21, 23 };
            ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
            IList<CoreLabel> tokens = tokenizer.Tokenize();

            NUnit.Framework.Assert.AreEqual("Number of tokens doesn't match reference", tokens.Count, beginOffsets.Length);
            for (int i = 0; i < beginOffsets.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual("Char begin offset deviates from reference", beginOffsets[i], tokens[i].BeginPosition());
                NUnit.Framework.Assert.AreEqual("Char end offset deviates from reference", endOffsets[i], tokens[i].EndPosition());
            }
        }
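
The test above pins down the offset bookkeeping of the ATB tokenizer: BeginPosition()/EndPosition() index into the raw, unsegmented input even after the pro-clitic, segmentation, and morpheme markers are stripped. A minimal standalone sketch of the same API (the using directives are my assumptions about the Sharpen-ported namespaces; every call mirrors one used in the test):

        using System.IO;
        using Edu.Stanford.Nlp.International.Arabic.Process;  // assumed namespace for ArabicTokenizer
        using Edu.Stanford.Nlp.Ling;                          // assumed namespace for CoreLabel
        using Edu.Stanford.Nlp.Process;                       // assumed namespace for ITokenizer(Factory)

        // Build an ATB factory and strip all three marker types, as in the test above.
        ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.AtbFactory();
        tf.SetOptions("removeProMarker,removeSegMarker,removeMorphMarker");

        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new StringReader("إِنَّ- -نا هادِئ+ُونَ ."));
        foreach (CoreLabel token in tokenizer.Tokenize())
        {
            // Offsets refer to the original input string, not the cleaned token text.
            System.Console.Out.WriteLine(token.Word() + " [" + token.BeginPosition() + "," + token.EndPosition() + ")");
        }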
Code example #2
        /// <summary>
        /// args[0] := tokenizer options
        /// args[1] := file to tokenize
        /// </summary>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
                System.Environment.Exit(-1);
            }
            string tokOptions = args[0];
            File   path       = new File(args[1]);

            log.Info("Reading from: " + path.GetPath());
            try
            {
                BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
                ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.Factory();
                tf.SetOptions(tokOptions);
                IMapper lexMapper = new DefaultLexicalMapper();
                lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
                int lineId = 0;
                for (string line; (line = br.ReadLine()) != null; lineId++)
                {
                    line = line.Trim();
                    // Tokenize with the tokenizer
                    IList<CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
                    System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
                    // Tokenize with the mapper
                    StringBuilder sb   = new StringBuilder();
                    string[]      toks = line.Split("\\s+");
                    foreach (string tok in toks)
                    {
                        string mappedTok = lexMapper.Map(null, tok);
                        sb.Append(mappedTok).Append(" ");
                    }
                    IList<string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
                    // Evaluate the output
                    if (mappedToks.Count != tokenizedLine.Count)
                    {
                        System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                    }
                    else
                    {
                        bool printLines = false;
                        for (int i = 0; i < mappedToks.Count; ++i)
                        {
                            string mappedTok    = mappedToks[i];
                            string tokenizedTok = tokenizedLine[i].Word();
                            if (!mappedTok.Equals(tokenizedTok))
                            {
                                System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                                printLines = true;
                            }
                        }
                        if (printLines)
                        {
                            System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                        }
                    }
                }
                System.Console.Error.Printf("Read %d lines.%n", lineId);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
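
Note that the mapper path above can run on its own when the input is already whitespace-tokenized and only the segmentation and morpheme markers need to be stripped. A minimal sketch (the Setup call and feature names are exactly those used above; the sample token is illustrative):

            IMapper lexMapper = new DefaultLexicalMapper();
            lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
            // Maps one raw token at a time; the first argument is unused here, as in the example.
            string mappedTok = lexMapper.Map(null, "هادِئ+ُونَ");
            System.Console.Out.WriteLine(mappedTok);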
Code example #3
        /// <summary>For debugging.</summary>
        /// <param name="args"/>
        /// <exception cref="System.IO.IOException"></exception>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file > output%n", typeof(ArabicDocumentReaderAndWriter).FullName);
                System.Environment.Exit(-1);
            }
            string fileName = args[0];
            ITokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.AtbFactory();
            string atbVocOptions = "removeProMarker,removeMorphMarker";

            tokFactory.SetOptions(atbVocOptions);
            BufferedReader reader = IOUtils.ReaderFromString(fileName);

            for (string line; (line = reader.ReadLine()) != null;)
            {
                string[] toks    = line.Split("\\s+");
                string   delim   = Pattern.Quote(tagDelimiter);
                bool     isStart = true;
                foreach (string wordTag in toks)
                {
                    string[] wordTagPair = wordTag.Split(delim);
                    System.Diagnostics.Debug.Assert(wordTagPair.Length == 2);
                    string word = wordTagPair[0];
                    if (tokFactory != null)
                    {
                        IList<CoreLabel> lexList = tokFactory.GetTokenizer(new StringReader(word)).Tokenize();
                        if (lexList.Count == 0)
                        {
                            continue;
                        }
                        else if (lexList.Count == 1)
                        {
                            word = lexList[0].Value();
                        }
                        else
                        {
                            string secondWord = lexList[1].Value();
                            if (secondWord.Equals(DefaultSegMarker.ToString()))
                            {
                                // Special case for the null marker in the vocalized section
                                word = lexList[0].Value() + DefaultSegMarker.ToString();
                            }
                            else
                            {
                                System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", typeof(ArabicDocumentReaderAndWriter).FullName, word);
                                word = lexList[0].Value();
                            }
                        }
                    }
                    if (!isStart)
                    {
                        System.Console.Out.Write(" ");
                    }
                    System.Console.Out.Write(word);
                    isStart = false;
                }
                System.Console.Out.WriteLine();
            }
        }
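
For a single word/tag pair, the per-token logic above reduces to the following sketch (the sample pair and the "|" value for tagDelimiter are assumptions; the segment-marker handling mirrors the special case in the example):

            string wordTag = "كتاب|NN";  // hypothetical pair; tagDelimiter assumed to be "|"
            string[] wordTagPair = wordTag.Split(Pattern.Quote("|"));
            string word = wordTagPair[0];
            IList<CoreLabel> segs = tokFactory.GetTokenizer(new StringReader(word)).Tokenize();
            if (segs.Count == 1)
            {
                word = segs[0].Value();  // the common case: one segment per raw token
            }
            else if (segs.Count > 1 && segs[1].Value().Equals(DefaultSegMarker.ToString()))
            {
                // Re-attach the null segment marker so the vocalized form round-trips.
                word = segs[0].Value() + DefaultSegMarker.ToString();
            }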
Code example #4
        /// <summary>A fast, rule-based tokenizer for Spanish based on AnCora.</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Spanish based on AnCora.
        /// Performs punctuation splitting and light tokenization by default.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It assumes that the input
        /// file is delimited by the system line separator. The output will be equivalently
        /// delimited.
        /// </p>
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Lexer options
            ITokenizerFactory<CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
            string orthoOptions = options.Contains("ancora") ? AncoraOptions : string.Empty;

            if (options.Contains("options"))
            {
                orthoOptions = orthoOptions.IsEmpty() ? options.GetProperty("options") : orthoOptions + ',' + options.GetProperty("options");
            }
            bool tokens = PropertiesUtils.GetBool(options, "tokens", false);

            if (!tokens)
            {
                orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
            }
            tf.SetOptions(orthoOptions);
            // Other options
            string encoding   = options.GetProperty("encoding", "UTF-8");
            bool   toLower    = PropertiesUtils.GetBool(options, "lowerCase", false);
            Locale es         = new Locale("es");
            bool   onePerLine = PropertiesUtils.GetBool(options, "onePerLine", false);
            // Read the file from stdin
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

            try
            {
                ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new BufferedReader(new InputStreamReader(Runtime.@in, encoding)));
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.Console.Out, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(SpanishLexer.NewlineToken))
                    {
                        ++nLines;
                        if (!onePerLine)
                        {
                            writer.NewLine();
                            printSpace = false;
                        }
                    }
                    else
                    {
                        string outputToken = toLower ? word.ToLower(es) : word;
                        if (onePerLine)
                        {
                            writer.Write(outputToken);
                            writer.NewLine();
                        }
                        else
                        {
                            if (printSpace)
                            {
                                writer.Write(" ");
                            }
                            writer.Write(outputToken);
                            printSpace = true;
                        }
                    }
                }
                // Flush the buffered writer so trailing output is not lost on exit.
                writer.Flush();
            }
            catch (UnsupportedEncodingException e)
            {
                throw new RuntimeIOException("Bad character encoding", e);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            long   elapsedTime = Runtime.NanoTime() - startTime;
            double linesPerSec = (double)nLines / (elapsedTime / 1e9);

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
        }
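
The Main method above is just a command-line driver; the same lexer can be used programmatically. A minimal sketch (the input sentence is illustrative; the factory, options, and MoveNext/Current iteration are the ones used above):

            ITokenizerFactory<CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
            tf.SetOptions("tokenizeNLs");
            ITokenizer<CoreLabel> tok = tf.GetTokenizer(new StringReader("¡Hola, mundo! Esto es una prueba."));
            while (tok.MoveNext())
            {
                // SpanishLexer.NewlineToken marks line breaks when tokenizeNLs is set.
                System.Console.Out.WriteLine(tok.Current.Word());
            }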