/// <summary>
/// Creates a tokenizer backed by <c>JiebaSegmenter</c>, optionally loading the
/// user dictionary embedded in the calling assembly plus any configured
/// ignore/user dictionary files.
/// </summary>
/// <param name="input">Source text reader handed to the Lucene base tokenizer.</param>
/// <param name="Mode">Segmentation mode.</param>
/// <param name="defaultUserDict">When true, loads the embedded default user dictionary.</param>
public JieBaTokenizer(TextReader input, TokenizerMode Mode, bool defaultUserDict = false)
    : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
{
    _segmenter = new JiebaSegmenter();
    _mode = Mode;

    if (defaultUserDict)
    {
        // GetCallingAssembly must stay inline in this constructor: moving it
        // into a helper would change which assembly the dictionary is read from.
        _segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), _dictPath);
    }

    if (!string.IsNullOrEmpty(Settings.IgnoreDictFile))
    {
        // Merge the ignore-dictionary file into the stop-word set, skipping
        // blank lines and words that are already present.
        foreach (var word in FileExtension.ReadAllLines(Settings.IgnoreDictFile))
        {
            if (!string.IsNullOrEmpty(word) && !StopWords.Contains(word))
            {
                StopWords.Add(word);
            }
        }
    }

    if (!string.IsNullOrEmpty(Settings.UserDictFile))
    {
        _segmenter.LoadUserDict(Settings.UserDictFile);
    }

    Init();
}
/// <summary>
/// Inspects the token that was just produced, updates the tokenizer mode for
/// the tokens that follow, and returns the token unchanged.
/// </summary>
private SyntaxToken TryUpdateMode(SyntaxToken lastToken)
{
    switch (lastToken.TokenType)
    {
        case SyntaxTokenType.NewlineSymbol:
            // A newline only matters when it terminates a single-line comment.
            if (_mode == TokenizerMode.SingleLineComment)
            {
                _mode = TokenizerMode.Content;
            }

            break;

        case SyntaxTokenType.SingleQuote:
            // Toggle: closes an open FourCC literal, otherwise opens one.
            _mode = _mode == TokenizerMode.FourCC ? TokenizerMode.Content : TokenizerMode.FourCC;
            break;

        case SyntaxTokenType.DoubleQuotes:
            // Toggle: closes an open string literal, otherwise opens one.
            _mode = _mode == TokenizerMode.String ? TokenizerMode.Content : TokenizerMode.String;
            break;

        case SyntaxTokenType.DoubleForwardSlash:
            _mode = TokenizerMode.SingleLineComment;
            break;
    }

    return lastToken;
}
/// <summary>
/// Creates a Jieba-backed tokenizer over <paramref name="input"/>, loading the
/// stop-word list before initialization.
/// </summary>
/// <param name="input">Source text reader handed to the Lucene base tokenizer.</param>
/// <param name="mode">Segmentation mode.</param>
public JiebaTokenizer(TextReader input, TokenizerMode mode)
    : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
{
    _mode = mode;
    _segment = new JiebaSegment();

    LoadStopWords();
    Init();
}
/// <summary>
/// Segments <paramref name="text"/> into tokens carrying character offsets
/// accumulated from the start of the text. In non-default (search) mode,
/// 2- and 3-character sub-words found in the dictionary are emitted in
/// addition to each full word.
/// </summary>
public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
{
    var tokens = new List<Token>();
    var offset = 0;

    foreach (var word in Cut(text, hmm: hmm))
    {
        var len = word.Length;

        if (mode != TokenizerMode.Default)
        {
            // Search mode: surface dictionary bigrams inside longer words.
            if (len > 2)
            {
                for (var i = 0; i < len - 1; i++)
                {
                    var bigram = word.Substring(i, 2);
                    if (WordDict.ContainsWord(bigram))
                    {
                        tokens.Add(new Token(bigram, offset + i, offset + i + 2));
                    }
                }
            }

            // ...and dictionary trigrams as well.
            if (len > 3)
            {
                for (var i = 0; i < len - 2; i++)
                {
                    var trigram = word.Substring(i, 3);
                    if (WordDict.ContainsWord(trigram))
                    {
                        tokens.Add(new Token(trigram, offset + i, offset + i + 3));
                    }
                }
            }
        }

        // The full word is always emitted last for its span.
        tokens.Add(new Token(word, offset, offset + len));
        offset += len;
    }

    return tokens;
}
/// <summary>
/// Builds a Viterbi searcher over the given connection costs and unknown-word
/// dictionary for the requested tokenizer mode.
/// </summary>
/// <param name="penalties">
/// Empty to keep the default penalty settings; otherwise expected to hold four
/// values: kanji length threshold, kanji penalty, other length threshold,
/// other penalty. NOTE(review): a list with 1-3 entries throws
/// ArgumentOutOfRangeException on the indexers below - confirm callers always
/// pass exactly 0 or 4 entries.
/// </param>
public ViterbiSearcher(TokenizerMode mode, ConnectionCosts costs, UnknownDictionary unknownDictionary, List <int> penalties)
{
    if (penalties.Count != 0)
    {
        KanjiPenaltyLengthThreshold = penalties[0];
        KanjiPenalty = penalties[1];
        OtherPenaltyLengthThreshold = penalties[2];
        OtherPenalty = penalties[3];
    }

    Mode = mode;
    Costs = costs;
    UnknownDictionary = unknownDictionary;

    // The multi-searcher shares this instance so it can reuse costs and mode.
    MultiSearcher = new MultiSearcher(costs, mode, this);
}
/// <summary>
/// Creates a Jieba-backed tokenizer and loads the stop-word list from the
/// file at <c>stopUrl</c>, one word per line.
/// </summary>
/// <param name="input">Source text reader handed to the Lucene base tokenizer.</param>
/// <param name="Mode">Segmentation mode.</param>
public JieBaTokenizer(TextReader input, TokenizerMode Mode)
    : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
{
    segmenter = new JiebaSegmenter();
    mode = Mode;

    // Fix: the original leaked the StreamReader returned by File.OpenText.
    // Dispose it deterministically so the file handle is released even if
    // reading throws.
    using (StreamReader rd = File.OpenText(stopUrl))
    {
        string s;
        while ((s = rd.ReadLine()) != null)
        {
            stopWords.Add(s);
        }
    }

    Init();
}
/// <summary>
/// Segments <paramref name="text"/> into tokens whose offsets come from the
/// cut result itself. In non-default (search) mode, 2- and 3-character
/// sub-words found in the dictionary are emitted alongside each full word.
/// </summary>
public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
{
    var tokens = new List<Token>();

    foreach (var cut in Cut2(text, hmm: hmm))
    {
        var len = cut.value.Length;

        if (mode != TokenizerMode.Default)
        {
            // Search mode: surface dictionary bigrams inside longer words.
            if (len > 2)
            {
                for (var i = 0; i < len - 1; i++)
                {
                    var bigram = cut.value.Substring(i, 2);
                    if (WordDict.ContainsWord(bigram))
                    {
                        tokens.Add(new Token(bigram, cut.position + i, cut.position + i + 2));
                    }
                }
            }

            // ...and dictionary trigrams as well.
            if (len > 3)
            {
                for (var i = 0; i < len - 2; i++)
                {
                    var trigram = cut.value.Substring(i, 3);
                    if (WordDict.ContainsWord(trigram))
                    {
                        tokens.Add(new Token(trigram, cut.position + i, cut.position + i + 3));
                    }
                }
            }
        }

        // The full word is always emitted last for its span.
        tokens.Add(new Token(cut.value, cut.position, cut.position + len));
    }

    return tokens;
}
/// <summary>
/// Classifies an alphanumeric token according to the current tokenizer mode.
/// Inside a string, FourCC, or comment the token simply takes that mode's
/// type; otherwise the literal kind is derived from its leading characters.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="token"/> is null or empty.</exception>
internal static SyntaxTokenType GetAlphanumericalTokenType(string token, TokenizerMode mode)
{
    if (string.IsNullOrEmpty(token))
    {
        throw new ArgumentNullException(nameof(token));
    }

    if (mode == TokenizerMode.String)
    {
        return SyntaxTokenType.String;
    }

    if (mode == TokenizerMode.FourCC)
    {
        return SyntaxTokenType.FourCCNumber;
    }

    if (mode == TokenizerMode.SingleLineComment)
    {
        return SyntaxTokenType.Comment;
    }

    var first = token[0];

    // A '$' prefix marks a hexadecimal literal.
    if (first == '$')
    {
        return SyntaxTokenType.HexadecimalNumber;
    }

    // A decimal point anywhere makes it a real-number literal.
    if (token.Contains('.'))
    {
        return SyntaxTokenType.RealNumber;
    }

    // Anything not starting with a digit is an identifier.
    if (!char.IsDigit(first))
    {
        return SyntaxTokenType.AlphanumericIdentifier;
    }

    if (first != '0')
    {
        return SyntaxTokenType.DecimalNumber;
    }

    // Leading zero: "0x"/"0X" is hexadecimal, any other form is octal.
    var second = token.Length >= 2 ? char.ToLower(token[1]) : '\0';
    return second == 'x' ? SyntaxTokenType.HexadecimalNumber : SyntaxTokenType.OctalNumber;
}
/// <summary>
/// Creates the analyzer with the given segmentation mode.
/// </summary>
/// <param name="Mode">Segmentation mode used by tokenizers this analyzer creates.</param>
public JieBaAnalyzer(TokenizerMode Mode)
{
    this.mode = Mode;
}
/// <summary>
/// Lazily tokenizes <c>_text</c> into a stream of syntax tokens, tracking the
/// current tokenizer mode (content, string, FourCC, single-line comment) via
/// <c>TryUpdateMode</c>, and always ends with an end-of-file token.
/// </summary>
public IEnumerable<SyntaxToken> Tokenize()
{
    _mode = TokenizerMode.Content;
    var buffer = new StringBuilder(BufferCapacity);

    const char EscapeCharacter = '\\';
    var encounteredEscapeCharacter = false;

    using (var reader = new StringReader(_text))
    {
        while (true)
        {
            var peek = reader.Peek();
            if (peek == EscapeCharacter)
            {
                // Escape characters are only legal outside Content mode
                // (i.e. inside string/FourCC/comment text).
                // NOTE(review): consider a more specific exception type than Exception.
                if (_mode == TokenizerMode.Content)
                {
                    throw new Exception($"Unexpectedly encountered escape character '{EscapeCharacter}'");
                }

                if (!encounteredEscapeCharacter)
                {
                    // First half of an escape pair: consume it and treat the
                    // next character literally (it will not act as a delimiter).
                    encounteredEscapeCharacter = true;
                    buffer.Append((char)reader.Read());
                    continue;
                }
            }

            var ignoreDelimiter = encounteredEscapeCharacter;
            encounteredEscapeCharacter = false;

            // TODO: support multiCharacter delimiters (eg /* */ in vJASS)
            if (!ignoreDelimiter && IsCharacterDelimiter(peek))
            {
                if (buffer.Length == 0)
                {
                    // No buffered text: the delimiter itself is the next token.
                    if (peek == -1)
                    {
                        break;
                    }

                    var peekChar = (char)peek;
                    if (SyntaxToken.TryTokenizeSingleSymbol(peekChar, out var token))
                    {
                        reader.Read();
                        peek = reader.Peek();
                        // Prefer the 2-character symbol if the pair forms one.
                        if (peek != -1 && SyntaxToken.TryTokenizeKeyword($"{peekChar}{(char)peek}", out var token2))
                        {
                            reader.Read();

                            // Assumption: non-alphanumeric tokens are at most 2 characters (==, !=, <=, >=, //).
                            yield return (TryUpdateMode(token2));
                        }
                        else
                        {
                            yield return (TryUpdateMode(token));
                        }

                        continue;
                    }
                    else if (char.IsWhiteSpace(peekChar))
                    {
                        reader.Read();
                        continue;
                    }
                    else
                    {
                        // Handle 2-char symbols for which the first symbol on its own is not a token.
                        reader.Read();
                        peek = reader.Read();
                        if (SyntaxToken.TryTokenizeKeyword($"{peekChar}{(char)peek}", out var token2))
                        {
                            yield return (TryUpdateMode(token2));
                            continue;
                        }

                        throw new Exception($"Invalid sequence of symbols: {peekChar}{(char)peek}");
                    }
                }
                else
                {
                    // Delimiter reached with buffered text: flush the buffer as
                    // one token (keyword in Content mode, otherwise classified
                    // by GetAlphanumericalTokenType under the current mode).
                    var tokenText = buffer.ToString();
                    buffer.Length = 0;
                    if (_mode == TokenizerMode.Content && SyntaxToken.TryTokenizeKeyword(tokenText, out var token))
                    {
                        yield return (TryUpdateMode(token));
                    }
                    else
                    {
                        yield return (TryUpdateMode(new SyntaxToken(SyntaxToken.GetAlphanumericalTokenType(tokenText, _mode), tokenText)));
                    }

                    continue;
                }
            }

            // Ordinary character: accumulate it into the current token.
            buffer.Append((char)reader.Read());
        }

        yield return (new SyntaxToken(SyntaxTokenType.EndOfFile));
    }
}
/// <summary>
/// Wires up the lattice builder with its trie and dictionaries. Search mode
/// is enabled for both the Search and Extended tokenizer modes.
/// </summary>
public ViterbiBuilder(DoubleArrayTrie doubleArrayTrie, TokenInfoDictionary dictionary, UnknownDictionary unknownDictionary, UserDictionary userDictionary, TokenizerMode mode)
{
    DoubleArrayTrie = doubleArrayTrie;
    Dictionary = dictionary;

    UnknownDictionary = unknownDictionary;
    CharacterDefinitions = UnknownDictionary.CharacterDefinition;

    // The user dictionary is optional; remember whether one was supplied.
    UserDictionary = userDictionary;
    UseUserDictionary = userDictionary != null;

    SearchMode = mode == TokenizerMode.Search || mode == TokenizerMode.Extended;
}
/// <summary>
/// Segments <paramref name="text"/> into tokens with running character
/// offsets. In non-default (search) mode, dictionary 2- and 3-character
/// sub-words are emitted in addition to each full word.
/// </summary>
public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
{
    var result = new List<Token>();
    var pos = 0;
    var searchMode = mode != TokenizerMode.Default;

    foreach (var w in Cut(text, hmm: hmm))
    {
        var width = w.Length;

        if (searchMode)
        {
            // Emit dictionary bigrams contained in the word.
            if (width > 2)
            {
                for (var i = 0; i < width - 1; i++)
                {
                    var gram2 = w.Substring(i, 2);
                    if (WordDict.ContainsWord(gram2))
                    {
                        result.Add(new Token(gram2, pos + i, pos + i + 2));
                    }
                }
            }

            // Emit dictionary trigrams contained in the word.
            if (width > 3)
            {
                for (var i = 0; i < width - 2; i++)
                {
                    var gram3 = w.Substring(i, 3);
                    if (WordDict.ContainsWord(gram3))
                    {
                        result.Add(new Token(gram3, pos + i, pos + i + 3));
                    }
                }
            }
        }

        // The full word always closes out its span.
        result.Add(new Token(w, pos, pos + width));
        pos += width;
    }

    return result;
}
/// <summary>
/// Initializes the tokenizer in the default scanning mode.
/// </summary>
public RouteTokenizer() => _mode = TokenizerMode.Default;
/// <summary>
/// Switches the tokenizer into <paramref name="mode"/>. If a token has been
/// ungot and its kind is sensitive to the parse mode, the parser is resynced
/// so that token gets rescanned under the new mode.
/// </summary>
private void SetTokenizerMode(TokenizerMode mode)
{
    if (mode != this._tokenizer.Mode)
    {
        var pending = this._ungotToken;
        if (pending != null && !pending.Kind.HasTrait(TokenFlags.ParseModeInvariant))
        {
            this.Resync(pending);
        }
    }

    this._tokenizer.Mode = mode;
}
/// <summary>
/// Switches the tokenizer into <paramref name="mode"/>, rescanning any ungot
/// token whose kind depends on the parse mode. In DEBUG builds, parse-mode
/// invariant tokens are rescanned anyway to verify they really are invariant.
/// </summary>
private void SetTokenizerMode(TokenizerMode mode)
{
    if (mode != _tokenizer.Mode && _ungotToken != null)
    {
        // Only rescan tokens that differ b/w command and expression modes.
        if (!_ungotToken.Kind.HasTrait(TokenFlags.ParseModeInvariant))
        {
            Resync(_ungotToken);
        }
#if DEBUG
        else if (_ungotToken.Kind != TokenKind.EndOfInput)
        {
            // Verify the comment above.
            Token ungotToken = _ungotToken;
            var oldTokenList = _tokenizer.TokenList;
            _tokenizer.TokenList = null;
            Resync(_ungotToken);
            _tokenizer.Mode = mode;
            Token rescan = _tokenizer.NextToken();

            // The rescanned token must match the ungot one in kind, text, and extent.
            Diagnostics.Assert(ungotToken.Kind == rescan.Kind, "Rescan failed to return same kind");
            Diagnostics.Assert(ungotToken.Text == rescan.Text, "Rescan failed to return same text");
            IScriptPosition pos1 = ungotToken.Extent.StartScriptPosition;
            IScriptPosition pos2 = rescan.Extent.StartScriptPosition;
            Diagnostics.Assert(pos1.ColumnNumber == pos2.ColumnNumber, "Rescan failed to return same start column");
            Diagnostics.Assert(pos1.LineNumber == pos2.LineNumber, "Rescan failed to return same start line#");
            pos1 = ungotToken.Extent.EndScriptPosition;
            pos2 = rescan.Extent.EndScriptPosition;
            Diagnostics.Assert(pos1.ColumnNumber == pos2.ColumnNumber, "Rescan failed to return same end column");
            Diagnostics.Assert(pos1.LineNumber == pos2.LineNumber, "Rescan failed to return same end line#");

            // Make sure we leave things as they were - Resync clears _ungotToken.
            _ungotToken = ungotToken;
            _tokenizer.TokenList = oldTokenList;
        }
#endif
    }

    _tokenizer.Mode = mode;
}
/// <summary>
/// Creates the analyzer. Only two modes are meaningful: as a rule of thumb,
/// use Search when indexing (writing) and the default mode when querying.
/// </summary>
/// <param name="Mode">Segmentation mode used by this analyzer.</param>
public Analyzer1(TokenizerMode Mode)
{
    this.Mode = Mode;
}
/// <summary>
/// Scans a route pattern into its token stream. Recognizes the separator '/',
/// parameter braces '{' and '}', the wildcard '*', runs of digits (numbers),
/// and letter-led runs of letters/digits (names).
/// </summary>
/// <param name="route">The route pattern to tokenize; must be non-null and non-empty.</param>
/// <returns>The tokens in source order.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="route"/> is null.</exception>
/// <exception cref="RouteParsingException">Thrown when the route is empty or contains an unexpected character.</exception>
public RouteToken[] Tokenize(string route)
{
    // Fix: a null route previously failed with a NullReferenceException on
    // route.Length; reject it explicitly instead.
    if (route == null)
    {
        throw new ArgumentNullException(nameof(route));
    }

    if (route.Length < 1)
        throw new RouteParsingException(route, 0, "The route is empty.");

    var token = new List<RouteToken>();
    int startPos = -1;
    int position = -1;

    // Fix: dispose the reader deterministically.
    using (var reader = new StringReader(route))
    {
        char current, next;
        while (reader.Peek() != -1)
        {
            position++;
            current = (char)reader.Read();
            // At end of input Peek() is -1, so next becomes '\uFFFF', which is
            // neither a letter nor a digit - pending number/name runs terminate.
            next = (char)reader.Peek();
            switch (_mode)
            {
                case TokenizerMode.Default:
                    if (current == '/')
                    {
                        token.Add(new SeparatorToken(position));
                        continue;
                    }

                    if (current == '{')
                    {
                        token.Add(new StartParameterToken(position));
                        continue;
                    }

                    if (current == '}')
                    {
                        token.Add(new EndParameterToken(position));
                        continue;
                    }

                    if (current == '*')
                    {
                        token.Add(new WildcardToken(position));
                        continue;
                    }

                    if (char.IsDigit(current))
                    {
                        // A single digit followed by a non-digit is a complete number.
                        if (!char.IsDigit(next))
                        {
                            token.Add(new NumberToken(ParseNumber(route, position, 1), position, 1));
                            continue;
                        }

                        _mode = TokenizerMode.Number;
                        startPos = position;
                        continue;
                    }

                    if (char.IsLetter(current))
                    {
                        // A single letter followed by a non-alphanumeric is a complete name.
                        if (!char.IsLetterOrDigit(next))
                        {
                            token.Add(new NameToken(current.ToString(), position, 1));
                            continue;
                        }

                        _mode = TokenizerMode.Name;
                        startPos = position;
                        continue;
                    }

                    throw new RouteParsingException(route, position, "Unexpected character {0}.", current);

                case TokenizerMode.Number:
                    // Flush the number run once the next character is not a digit.
                    if (!char.IsDigit(next))
                    {
                        token.Add(new NumberToken(ParseNumber(route, startPos, position - startPos + 1), startPos, position - startPos + 1));
                        _mode = TokenizerMode.Default;
                    }

                    continue;

                case TokenizerMode.Name:
                    // Flush the name run once the next character is not alphanumeric.
                    if (!char.IsLetterOrDigit(next))
                    {
                        token.Add(new NameToken(route.Substring(startPos, position - startPos + 1), startPos, position - startPos + 1));
                        _mode = TokenizerMode.Default;
                    }

                    continue;
            }
        }
    }

    return token.ToArray();
}
/// <summary>
/// Tokenizes <paramref name="text"/> by delegating to the underlying segmenter.
/// </summary>
public IEnumerable<Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    => segmenter.Tokenize(text, mode);
/// <summary>
/// Creates the analyzer with the given segmentation mode.
/// </summary>
/// <param name="Mode">Segmentation mode used by tokenizers this analyzer creates.</param>
/// <param name="defaultUserDict">Whether tokenizers should load the embedded default user dictionary.</param>
public JieBaAnalyzer(TokenizerMode Mode, bool defaultUserDict = false)
{
    _defaultUserDict = defaultUserDict;
    _mode = Mode;
}
/// <summary>
/// Tokenizes <paramref name="text"/> by delegating to the underlying segmenter.
/// </summary>
public IEnumerable<JiebaNet.Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    => segmenter.Tokenize(text, mode);
/// <summary>
/// Creates a multi-searcher that shares its costs and mode with the owning
/// Viterbi searcher.
/// </summary>
public MultiSearcher(ConnectionCosts costs, TokenizerMode mode, ViterbiSearcher viterbiSearcher)
{
    Mode = mode;
    Costs = costs;
    ViterbiSearcher = viterbiSearcher;
}
/// <summary>
/// Creates the analyzer with the given segmentation mode.
/// </summary>
/// <param name="mode">Segmentation mode used by tokenizers this analyzer creates.</param>
public JiebaAnalyzer(TokenizerMode mode) => _mode = mode;
/// <summary>
/// Creates the tokenizer over <paramref name="textReader"/>.
/// </summary>
/// <param name="textReader">Source text reader handed to the Lucene base tokenizer.</param>
/// <param name="Mode">
/// NOTE(review): this parameter is never stored or used by the constructor -
/// confirm whether it should be assigned to a mode field.
/// </param>
public Tokenizer1(TextReader textReader, TokenizerMode Mode)
    : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, textReader)
{
}