Example #1
0
        /// <summary>
        /// Creates a tokenizer backed by a Jieba segmenter, optionally loading the
        /// embedded default user dictionary as well as the ignore-word and user
        /// dictionaries configured in <c>Settings</c>.
        /// </summary>
        /// <param name="input">Reader supplying the text to tokenize.</param>
        /// <param name="Mode">Tokenization mode for the segmenter.</param>
        /// <param name="defaultUserDict">When true, loads the embedded default user dictionary.</param>
        public JieBaTokenizer(TextReader input, TokenizerMode Mode, bool defaultUserDict = false)
            : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
        {
            _mode      = Mode;
            _segmenter = new JiebaSegmenter();

            if (defaultUserDict)
            {
                _segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), _dictPath);
            }

            var ignoreFile = Settings.IgnoreDictFile;
            if (!string.IsNullOrEmpty(ignoreFile))
            {
                // Merge ignore-list entries into the stop-word set, skipping blanks and duplicates.
                foreach (var word in FileExtension.ReadAllLines(ignoreFile))
                {
                    if (!string.IsNullOrEmpty(word) && !StopWords.Contains(word))
                    {
                        StopWords.Add(word);
                    }
                }
            }

            var userDictFile = Settings.UserDictFile;
            if (!string.IsNullOrEmpty(userDictFile))
            {
                _segmenter.LoadUserDict(userDictFile);
            }

            Init();
        }
Example #2
0
        /// <summary>
        /// Advances the tokenizer mode based on the token just produced, then passes
        /// the token through unchanged. Quote tokens toggle their respective mode on
        /// and off, "//" always enters single-line-comment mode, and a newline ends one.
        /// </summary>
        /// <param name="lastToken">The token that was just emitted.</param>
        /// <returns>The same token, for convenient yielding.</returns>
        private SyntaxToken TryUpdateMode(SyntaxToken lastToken)
        {
            switch (lastToken.TokenType)
            {
                case SyntaxTokenType.NewlineSymbol:
                    // A newline only matters as the terminator of a single-line comment.
                    if (_mode == TokenizerMode.SingleLineComment)
                    {
                        _mode = TokenizerMode.Content;
                    }
                    break;

                case SyntaxTokenType.SingleQuote:
                    _mode = _mode == TokenizerMode.FourCC ? TokenizerMode.Content : TokenizerMode.FourCC;
                    break;

                case SyntaxTokenType.DoubleQuotes:
                    _mode = _mode == TokenizerMode.String ? TokenizerMode.Content : TokenizerMode.String;
                    break;

                case SyntaxTokenType.DoubleForwardSlash:
                    _mode = TokenizerMode.SingleLineComment;
                    break;
            }

            return lastToken;
        }
 /// <summary>
 /// Creates a tokenizer built on a Jieba segmenter: loads the stop-word list,
 /// then initializes token attributes.
 /// </summary>
 /// <param name="input">Reader supplying the text to tokenize.</param>
 /// <param name="mode">Tokenization mode for the segmenter.</param>
 public JiebaTokenizer(TextReader input, TokenizerMode mode)
     : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
 {
     _segment = new JiebaSegment();
     _mode    = mode;
     LoadStopWords();
     Init();
 }
Example #4
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with character offsets. In the default
        /// mode each cut word becomes one token; in any other (search) mode, every
        /// in-dictionary bigram and trigram inside a word is also emitted before the
        /// word itself, mirroring jieba's search-engine tokenizer.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="mode">Default for plain segmentation; anything else enables search-style n-grams.</param>
        /// <param name="hmm">Whether the HMM is used for unknown words during cutting.</param>
        /// <returns>Tokens in order of appearance.</returns>
        public IEnumerable <Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
        {
            var tokens     = new List <Token>();
            var offset     = 0;
            var searchMode = mode != TokenizerMode.Default;

            foreach (var word in Cut(text, hmm: hmm))
            {
                var len = word.Length;

                if (searchMode)
                {
                    // Bigrams: only for words of 3+ characters.
                    if (len > 2)
                    {
                        for (var i = 0; i + 1 < len; i++)
                        {
                            var pair = word.Substring(i, 2);
                            if (WordDict.ContainsWord(pair))
                            {
                                tokens.Add(new Token(pair, offset + i, offset + i + 2));
                            }
                        }
                    }

                    // Trigrams: only for words of 4+ characters.
                    if (len > 3)
                    {
                        for (var i = 0; i + 2 < len; i++)
                        {
                            var triple = word.Substring(i, 3);
                            if (WordDict.ContainsWord(triple))
                            {
                                tokens.Add(new Token(triple, offset + i, offset + i + 3));
                            }
                        }
                    }
                }

                tokens.Add(new Token(word, offset, offset + len));
                offset += len;
            }

            return tokens;
        }
Example #5
0
        /// <summary>
        /// Creates a Viterbi searcher with optional lattice penalties.
        /// </summary>
        /// <param name="mode">The tokenizer mode this searcher operates in.</param>
        /// <param name="costs">Connection costs between lattice nodes.</param>
        /// <param name="unknownDictionary">Dictionary used for out-of-vocabulary entries.</param>
        /// <param name="penalties">
        /// Either empty (keep default penalty fields) or at least four values, in order:
        /// kanji length threshold, kanji penalty, other length threshold, other penalty.
        /// NOTE(review): a non-empty list with fewer than four entries throws
        /// ArgumentOutOfRangeException on the indexing below — confirm callers always
        /// pass zero or four values.
        /// </param>
        public ViterbiSearcher(TokenizerMode mode, ConnectionCosts costs, UnknownDictionary unknownDictionary, List <int> penalties)
        {
            if (penalties.Count != 0)
            {
                KanjiPenaltyLengthThreshold = penalties[0];
                KanjiPenalty = penalties[1];
                OtherPenaltyLengthThreshold = penalties[2];
                OtherPenalty = penalties[3];
            }

            Mode              = mode;
            Costs             = costs;
            UnknownDictionary = unknownDictionary;
            // The multi-searcher shares this instance for n-best path expansion.
            MultiSearcher     = new MultiSearcher(costs, mode, this);
        }
Example #6
0
        /// <summary>
        /// Creates a tokenizer backed by a Jieba segmenter and loads the stop-word
        /// list from the file at <c>stopUrl</c>, one word per line.
        /// </summary>
        /// <param name="input">Reader supplying the text to tokenize.</param>
        /// <param name="Mode">Tokenization mode for the segmenter.</param>
        public JieBaTokenizer(TextReader input, TokenizerMode Mode)
            : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
        {
            segmenter = new JiebaSegmenter();
            mode      = Mode;

            // File.ReadLines disposes its underlying reader when enumeration ends;
            // the previous StreamReader from File.OpenText was never closed (handle leak).
            foreach (var line in File.ReadLines(stopUrl))
            {
                stopWords.Add(line);
            }

            Init();
        }
        /// <summary>
        /// Tokenizes <paramref name="text"/> using position-carrying cuts from
        /// <c>Cut2</c>. In the default mode each cut word becomes one token; in any
        /// other (search) mode, every in-dictionary bigram and trigram inside a word
        /// is also emitted before the word itself.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="mode">Default for plain segmentation; anything else enables search-style n-grams.</param>
        /// <param name="hmm">Whether the HMM is used for unknown words during cutting.</param>
        /// <returns>Tokens in order of appearance.</returns>
        public IEnumerable <Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
        {
            var tokens     = new List <Token>();
            var searchMode = mode != TokenizerMode.Default;

            foreach (var cut in Cut2(text, hmm: hmm))
            {
                var len = cut.value.Length;

                if (searchMode)
                {
                    // Bigrams: only for words of 3+ characters.
                    if (len > 2)
                    {
                        for (var i = 0; i + 1 < len; i++)
                        {
                            var pair = cut.value.Substring(i, 2);
                            if (WordDict.ContainsWord(pair))
                            {
                                tokens.Add(new Token(pair, cut.position + i, cut.position + i + 2));
                            }
                        }
                    }

                    // Trigrams: only for words of 4+ characters.
                    if (len > 3)
                    {
                        for (var i = 0; i + 2 < len; i++)
                        {
                            var triple = cut.value.Substring(i, 3);
                            if (WordDict.ContainsWord(triple))
                            {
                                tokens.Add(new Token(triple, cut.position + i, cut.position + i + 3));
                            }
                        }
                    }
                }

                tokens.Add(new Token(cut.value, cut.position, cut.position + len));
            }

            return tokens;
        }
Example #8
0
        /// <summary>
        /// Classifies a run of alphanumeric characters. Inside a string, FourCC
        /// literal, or comment the whole run is a single token kind; otherwise the
        /// first character(s) decide between identifier and number literal forms.
        /// </summary>
        /// <param name="token">The non-empty token text to classify.</param>
        /// <param name="mode">The tokenizer mode the text was read in.</param>
        /// <returns>The token type for <paramref name="token"/>.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="token"/> is null or empty.</exception>
        internal static SyntaxTokenType GetAlphanumericalTokenType(string token, TokenizerMode mode)
        {
            if (string.IsNullOrEmpty(token))
            {
                throw new ArgumentNullException(nameof(token));
            }

            if (mode == TokenizerMode.String)
            {
                return SyntaxTokenType.String;
            }

            if (mode == TokenizerMode.FourCC)
            {
                return SyntaxTokenType.FourCCNumber;
            }

            if (mode == TokenizerMode.SingleLineComment)
            {
                return SyntaxTokenType.Comment;
            }

            var first = token[0];

            // '$' prefix marks a hexadecimal literal.
            if (first == '$')
            {
                return SyntaxTokenType.HexadecimalNumber;
            }

            // A decimal point anywhere makes it a real-number literal.
            if (token.Contains('.'))
            {
                return SyntaxTokenType.RealNumber;
            }

            // Anything not starting with a digit is an identifier.
            if (!char.IsDigit(first))
            {
                return SyntaxTokenType.AlphanumericIdentifier;
            }

            // Leading zero: "0x..." (case-insensitive) is hexadecimal, otherwise octal.
            if (first == '0')
            {
                var second = token.Length >= 2 ? char.ToLower(token[1]) : '\0';
                return second == 'x' ? SyntaxTokenType.HexadecimalNumber : SyntaxTokenType.OctalNumber;
            }

            return SyntaxTokenType.DecimalNumber;
        }
Example #9
0
 /// <summary>
 /// Creates an analyzer whose tokenizers will use the given mode.
 /// </summary>
 /// <param name="Mode">Tokenization mode passed on to created tokenizers.</param>
 public JieBaAnalyzer(TokenizerMode Mode)
     : base()
 {
     this.mode = Mode;
 }
Example #10
0
        /// <summary>
        /// Lazily tokenizes <c>_text</c> into syntax tokens, tracking the current
        /// mode (content / string / FourCC / single-line comment) via
        /// <c>TryUpdateMode</c>. Backslash escapes suppress delimiter handling for
        /// the following character. Always ends with an EndOfFile token.
        /// </summary>
        public IEnumerable <SyntaxToken> Tokenize()
        {
            _mode = TokenizerMode.Content;

            var buffer = new StringBuilder(BufferCapacity);

            const char EscapeCharacter            = '\\';
            var        encounteredEscapeCharacter = false;

            using (var reader = new StringReader(_text))
            {
                while (true)
                {
                    var peek = reader.Peek();

                    if (peek == EscapeCharacter)
                    {
                        // Escapes are only meaningful inside strings/FourCC/comments.
                        if (_mode == TokenizerMode.Content)
                        {
                            throw new Exception($"Unexpectedly encountered escape character '{EscapeCharacter}'");
                        }

                        // First backslash of a pair: consume it and mark the next
                        // character as escaped so a delimiter after it is treated literally.
                        if (!encounteredEscapeCharacter)
                        {
                            encounteredEscapeCharacter = true;
                            buffer.Append((char)reader.Read());
                            continue;
                        }
                    }

                    var ignoreDelimiter = encounteredEscapeCharacter;
                    encounteredEscapeCharacter = false;

                    // TODO: support multiCharacter delimiters (eg /* */ in vJASS)
                    if (!ignoreDelimiter && IsCharacterDelimiter(peek))
                    {
                        if (buffer.Length == 0)
                        {
                            // No pending token text: the delimiter itself becomes the token.
                            if (peek == -1)
                            {
                                break; // end of input
                            }

                            var peekChar = (char)peek;

                            if (SyntaxToken.TryTokenizeSingleSymbol(peekChar, out var token))
                            {
                                // Greedily try to extend a 1-char symbol to a 2-char one.
                                reader.Read();
                                peek = reader.Peek();

                                if (peek != -1 && SyntaxToken.TryTokenizeKeyword($"{peekChar}{(char)peek}", out var token2))
                                {
                                    reader.Read();

                                    // Assumption: non-alphanumeric tokens are at most 2 characters (==, !=, <=, >=, //).
                                    yield return(TryUpdateMode(token2));
                                }
                                else
                                {
                                    yield return(TryUpdateMode(token));
                                }

                                continue;
                            }
                            else if (char.IsWhiteSpace(peekChar))
                            {
                                // Whitespace between tokens is discarded.
                                reader.Read();
                                continue;
                            }
                            else
                            {
                                // Handle 2-char symbols for which the first symbol on its own is not a token.
                                reader.Read();
                                peek = reader.Read();

                                if (SyntaxToken.TryTokenizeKeyword($"{peekChar}{(char)peek}", out var token2))
                                {
                                    yield return(TryUpdateMode(token2));

                                    continue;
                                }

                                throw new Exception($"Invalid sequence of symbols: {peekChar}{(char)peek}");
                            }
                        }
                        else
                        {
                            // A delimiter terminates the buffered token text. Emit the
                            // buffered token (keyword lookup only applies in content mode)
                            // and leave the delimiter itself for the next iteration.
                            var tokenText = buffer.ToString();
                            buffer.Length = 0;

                            if (_mode == TokenizerMode.Content && SyntaxToken.TryTokenizeKeyword(tokenText, out var token))
                            {
                                yield return(TryUpdateMode(token));
                            }
                            else
                            {
                                yield return(TryUpdateMode(new SyntaxToken(SyntaxToken.GetAlphanumericalTokenType(tokenText, _mode), tokenText)));
                            }

                            continue;
                        }
                    }

                    // Ordinary character: accumulate into the current token.
                    buffer.Append((char)reader.Read());
                }

                yield return(new SyntaxToken(SyntaxTokenType.EndOfFile));
            }
        }
Example #11
0
        /// <summary>
        /// Wires together the trie and dictionaries used to build Viterbi lattices.
        /// </summary>
        /// <param name="doubleArrayTrie">Trie over known-word surface forms.</param>
        /// <param name="dictionary">Token info for known words.</param>
        /// <param name="unknownDictionary">Fallback dictionary for unknown character runs.</param>
        /// <param name="userDictionary">Optional user dictionary; may be null.</param>
        /// <param name="mode">Tokenizer mode; Search and Extended both enable search-mode handling.</param>
        public ViterbiBuilder(DoubleArrayTrie doubleArrayTrie, TokenInfoDictionary dictionary, UnknownDictionary unknownDictionary, UserDictionary userDictionary, TokenizerMode mode)
        {
            DoubleArrayTrie   = doubleArrayTrie;
            Dictionary        = dictionary;
            UnknownDictionary = unknownDictionary;
            UserDictionary    = userDictionary;

            // A null user dictionary simply disables the user-dictionary lookup path.
            UseUserDictionary = userDictionary != null;

            SearchMode = mode == TokenizerMode.Search || mode == TokenizerMode.Extended;

            CharacterDefinitions = UnknownDictionary.CharacterDefinition;
        }
Example #12
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with character offsets. In the default
        /// mode each cut word becomes one token; in any other (search) mode, every
        /// in-dictionary bigram and trigram inside a word is also emitted before the
        /// word itself, mirroring jieba's search-engine tokenizer.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="mode">Default for plain segmentation; anything else enables search-style n-grams.</param>
        /// <param name="hmm">Whether the HMM is used for unknown words during cutting.</param>
        /// <returns>Tokens in order of appearance.</returns>
        public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
        {
            var result = new List<Token>();
            var searchMode = mode != TokenizerMode.Default;
            var start = 0;

            // Emits every in-dictionary n-character substring of `word` as a token.
            void AddGrams(string word, int n)
            {
                for (var i = 0; i + n <= word.Length; i++)
                {
                    var gram = word.Substring(i, n);
                    if (WordDict.ContainsWord(gram))
                    {
                        result.Add(new Token(gram, start + i, start + i + n));
                    }
                }
            }

            foreach (var w in Cut(text, hmm: hmm))
            {
                if (searchMode)
                {
                    if (w.Length > 2)
                    {
                        AddGrams(w, 2);
                    }
                    if (w.Length > 3)
                    {
                        AddGrams(w, 3);
                    }
                }

                result.Add(new Token(w, start, start + w.Length));
                start += w.Length;
            }

            return result;
        }
Example #13
0
 /// <summary>
 /// Creates a route tokenizer with its state machine in the Default mode.
 /// </summary>
 public RouteTokenizer()
 {
     _mode = TokenizerMode.Default;
 }
Example #14
0
 /// <summary>
 /// Switches the tokenizer into the given mode. If a token was already read ahead
 /// under the old mode and its kind can differ between parse modes, the tokenizer
 /// is re-synced so that token is rescanned under the new mode.
 /// </summary>
 /// <param name="mode">The mode to switch the tokenizer into.</param>
 private void SetTokenizerMode(TokenizerMode mode)
 {
     if (mode != this._tokenizer.Mode && this._ungotToken != null)
     {
         // Parse-mode-invariant tokens scan identically in every mode, so no resync needed.
         if (!this._ungotToken.Kind.HasTrait(TokenFlags.ParseModeInvariant))
         {
             this.Resync(this._ungotToken);
         }
     }

     this._tokenizer.Mode = mode;
 }
Example #15
0
        /// <summary>
        /// Switches the tokenizer into the given mode. If a token was already read
        /// ahead (<c>_ungotToken</c>) under the old mode and its kind is not
        /// parse-mode invariant, the tokenizer is re-synced so the token is
        /// rescanned under the new mode. In DEBUG builds, invariant tokens are
        /// rescanned anyway to verify that they really tokenize identically.
        /// </summary>
        /// <param name="mode">The mode to switch the tokenizer into.</param>
        private void SetTokenizerMode(TokenizerMode mode)
        {
            if (mode != _tokenizer.Mode && _ungotToken != null)
            {
                // Only rescan tokens that differ b/w command and expression modes.
                if (!_ungotToken.Kind.HasTrait(TokenFlags.ParseModeInvariant))
                {
                    Resync(_ungotToken);
                }
#if DEBUG
                else if (_ungotToken.Kind != TokenKind.EndOfInput)
                {
                    // Verify the comment above.
                    Token ungotToken = _ungotToken;
                    // Suppress token recording during the verification rescan.
                    var oldTokenList = _tokenizer.TokenList;
                    _tokenizer.TokenList = null;
                    Resync(_ungotToken);
                    _tokenizer.Mode = mode;
                    Token rescan = _tokenizer.NextToken();
                    Diagnostics.Assert(ungotToken.Kind == rescan.Kind, "Rescan failed to return same kind");
                    Diagnostics.Assert(ungotToken.Text == rescan.Text, "Rescan failed to return same text");
                    IScriptPosition pos1 = ungotToken.Extent.StartScriptPosition;
                    IScriptPosition pos2 = rescan.Extent.StartScriptPosition;
                    Diagnostics.Assert(pos1.ColumnNumber == pos2.ColumnNumber, "Rescan failed to return same start column");
                    Diagnostics.Assert(pos1.LineNumber == pos2.LineNumber, "Rescan failed to return same start line#");
                    pos1 = ungotToken.Extent.EndScriptPosition;
                    pos2 = rescan.Extent.EndScriptPosition;
                    Diagnostics.Assert(pos1.ColumnNumber == pos2.ColumnNumber, "Rescan failed to return same end column");
                    Diagnostics.Assert(pos1.LineNumber == pos2.LineNumber, "Rescan failed to return same end line#");
                    // Make sure we leave things as they were - Resync clears _ungotToken.
                    _ungotToken = ungotToken;
                    _tokenizer.TokenList = oldTokenList;
                }
#endif
            }
            _tokenizer.Mode = mode;
        }
Example #16
0
 /// <summary>
 /// Creates an analyzer that tokenizes with the given mode.
 /// </summary>
 /// <param name="Mode">One of two modes: generally use Search when indexing (writing) and the default mode when querying.</param>
 public Analyzer1(TokenizerMode Mode) : base()
 {
     this.Mode = Mode;
 }
Example #17
0
        /// <summary>
        /// Splits a route pattern (e.g. "users/{id}/posts") into tokens: separators,
        /// parameter braces, wildcards, numbers and names. Multi-character numbers
        /// and names are accumulated via a small state machine in <c>_mode</c>.
        /// NOTE(review): at end of input reader.Peek() returns -1, so `next` becomes
        /// (char)0xFFFF — not a letter or digit, which terminates Number/Name runs;
        /// confirm this is intentional rather than accidental.
        /// </summary>
        /// <param name="route">The route pattern to tokenize; must be non-empty.</param>
        /// <returns>The tokens in order of appearance.</returns>
        /// <exception cref="RouteParsingException">Empty route or an unexpected character.</exception>
        public RouteToken[] Tokenize(string route)
        {
            if (route.Length < 1)
                throw new RouteParsingException(route, 0, "The route is empty.");

            var reader = new StringReader(route);
            var token = new List<RouteToken>();

            // Start of the current multi-character Number/Name run.
            int startPos = -1;
            char current, next;
            int position = -1;
            while (reader.Peek() != -1)
            {
                position++;
                current = (char)reader.Read();
                next = (char)reader.Peek();

                switch (_mode)
                {
                    // Default mode: single-character tokens, or the start of a run.
                    case TokenizerMode.Default:
                        if (current == '/')
                        {
                            token.Add(new SeparatorToken(position));
                            continue;
                        }

                        if (current == '{')
                        {
                            token.Add(new StartParameterToken(position));
                            continue;
                        }

                        if (current == '}')
                        {
                            token.Add(new EndParameterToken(position));
                            continue;
                        }

                        if (current == '*')
                        {
                            token.Add(new WildcardToken(position));
                            continue;
                        }

                        if (char.IsDigit(current))
                        {
                            // Single-digit number: emit immediately without entering Number mode.
                            if (!char.IsDigit(next))
                            {
                                token.Add(new NumberToken(ParseNumber(route, position, 1), position, 1));
                                continue;
                            }

                            _mode = TokenizerMode.Number;
                            startPos = position;
                            continue;
                        }

                        if (char.IsLetter(current))
                        {
                            // Single-letter name: emit immediately without entering Name mode.
                            if (!char.IsLetterOrDigit(next))
                            {
                                token.Add(new NameToken(current.ToString(), position, 1));
                                continue;
                            }

                            _mode = TokenizerMode.Name;
                            startPos = position;
                            continue;
                        }

                        throw new RouteParsingException(route, position, "Unexpected character {0}.", current);

                    // Number mode: keep consuming until the next char is not a digit.
                    case TokenizerMode.Number:
                        if (!char.IsDigit(next))
                        {
                            token.Add(new NumberToken(ParseNumber(route, startPos, position - startPos + 1), startPos, position - startPos + 1));
                            _mode = TokenizerMode.Default;
                        }
                        continue;

                    // Name mode: keep consuming until the next char is not alphanumeric.
                    case TokenizerMode.Name:
                        if (!char.IsLetterOrDigit(next))
                        {
                            token.Add(new NameToken(route.Substring(startPos, position - startPos + 1), startPos, position - startPos + 1));
                            _mode = TokenizerMode.Default;
                        }
                        continue;
                }
            }

            return token.ToArray();
        }
Example #18
0
 /// <summary>
 /// Delegates tokenization to the underlying segmenter.
 /// </summary>
 /// <param name="text">Text to tokenize.</param>
 /// <param name="mode">Tokenization mode; Search by default.</param>
 /// <returns>The segmenter's tokens for <paramref name="text"/>.</returns>
 public IEnumerable <Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
     => segmenter.Tokenize(text, mode);
Example #19
0
 /// <summary>
 /// Creates an analyzer whose tokenizers use the given mode and, optionally,
 /// the embedded default user dictionary.
 /// </summary>
 /// <param name="Mode">Tokenization mode passed on to created tokenizers.</param>
 /// <param name="defaultUserDict">When true, tokenizers load the embedded default user dictionary.</param>
 public JieBaAnalyzer(TokenizerMode Mode, bool defaultUserDict = false) : base()
 {
     _mode            = Mode;
     _defaultUserDict = defaultUserDict;
 }
Example #20
0
 /// <summary>
 /// Delegates tokenization to the underlying segmenter.
 /// </summary>
 /// <param name="text">Text to tokenize.</param>
 /// <param name="mode">Tokenization mode; Search by default.</param>
 /// <returns>The segmenter's tokens for <paramref name="text"/>.</returns>
 public IEnumerable<JiebaNet.Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
     => segmenter.Tokenize(text, mode);
Example #21
0
 /// <summary>
 /// Creates an n-best path searcher over the Viterbi lattice.
 /// </summary>
 /// <param name="costs">Connection costs between lattice nodes.</param>
 /// <param name="mode">The tokenizer mode the searcher operates in.</param>
 /// <param name="viterbiSearcher">The single-best searcher this one builds on.</param>
 public MultiSearcher(ConnectionCosts costs, TokenizerMode mode, ViterbiSearcher viterbiSearcher)
 {
     Costs           = costs;
     Mode            = mode;
     ViterbiSearcher = viterbiSearcher;
 }
Example #22
0
 /// <summary>
 /// Creates an analyzer whose tokenizers use the given mode.
 /// </summary>
 /// <param name="mode">Tokenization mode passed on to created tokenizers.</param>
 public JiebaAnalyzer(TokenizerMode mode)
 {
     _mode = mode;
 }
Example #23
0
 /// <summary>
 /// Creates the tokenizer over the given reader; all setup is done by the base class.
 /// </summary>
 /// <param name="textReader">Reader supplying the text to tokenize.</param>
 /// <param name="Mode">NOTE(review): never stored or used in the visible body — confirm whether it should be assigned to a field.</param>
 public Tokenizer1(TextReader textReader, TokenizerMode Mode) : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, textReader)
 {
 }