public void Load(BinarySerializer reader)
{
    Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);
    // the following statements throw serialization-related exceptions
    mType = (TokenizerType)reader.ReadInt();
    mMinTokenLen = reader.ReadInt();
}
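// A minimal sketch of the matching Save method, assuming BinarySerializer exposes a
// WriteInt counterpart to ReadInt (not shown in the snippet above).
public void Save(BinarySerializer writer)
{
    Utils.ThrowException(writer == null ? new ArgumentNullException("writer") : null);
    // the following statements throw serialization-related exceptions
    writer.WriteInt((int)mType);
    writer.WriteInt(mMinTokenLen);
}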
public TokenizeConfig(string configStr)
{
    foreach (var kvp in StringOperations.ParseStringStringDictionary(configStr))
    {
        switch (kvp.Key)
        {
            case "TokenizerType":
                TokenizerType = (TokenizerType)StringOperations.ParseEnum(typeof(TokenizerType), kvp.Value);
                break;
            case "StopWordFile":
                StopWordFile = kvp.Value;
                break;
            case "AddStopWordsFile":
                AddStopWordsFile = kvp.Value.Length > 0 ? kvp.Value : null;
                break;
            case "UserDictFile":
                UserDictFile = kvp.Value.Length > 0 ? kvp.Value : null;
                break;
        }
    }
    Initialize();
}
public TokenizeConfig(TokenizerType tokenizerType, string stopWordFile = null,
    string addStopWordsFile = null, string userDictFile = null)
{
    TokenizerType = tokenizerType;
    StopWordFile = stopWordFile;
    AddStopWordsFile = addStopWordsFile;
    UserDictFile = userDictFile;
    Initialize();
}
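// Hedged usage sketch for the two constructors above. The enum member "Standard",
// the file name, and the "Key=Value" pair format assumed to be understood by
// ParseStringStringDictionary are placeholders, not values confirmed by these snippets.
var fromString = new TokenizeConfig("TokenizerType=Standard,StopWordFile=stopwords.txt");
var fromArgs = new TokenizeConfig(TokenizerType.Standard, stopWordFile: "stopwords.txt");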
public void AddTokensFromString(string str, TokenizerType tokType)
{
    Utils.ThrowException(str == null ? new ArgumentNullException("str") : null);
    Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
    SimpleTokenizer tokenizer = new SimpleTokenizer();
    tokenizer.Type = tokType;
    foreach (string token in tokenizer.GetTokens(str))
    {
        AddToken(token.ToUpper());
    }
}
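// Hypothetical usage; "wordList" (the enclosing collection type) and the enum member
// AllChars are assumptions. Note that tokens are upper-cased on insertion, so the
// resulting list is effectively case-insensitive.
wordList.AddTokensFromString("Chunky bacon is very chunky.", TokenizerType.AllChars);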
private ITokenizer GetTokenizer(TokenizerType rowProviderType, Stream stream)
{
    switch (rowProviderType)
    {
        case TokenizerType.Pipeline:
            return new PipelineStreamTokenizer(stream);
        case TokenizerType.String:
            return new StreamReaderTokenizer(new StreamReader(stream));
        case TokenizerType.ArrayPool:
            return new MemoryManagedTokenizer(stream);
        default:
            throw new InvalidDataException();
    }
}
public async Task BasicCsvParse(TokenizerType rowProviderType)
{
    using (var stream = GetStreamFromString("foo,bar,chunky,bacon"))
    using (var csv = GetTokenizer(rowProviderType, stream))
    {
        var row = await csv.GetNextAsync();
        Assert.Equal(4, row.Count());
        Assert.Equal("foo", row[0].ToString());
        Assert.Equal("bar", row[1].ToString());
        Assert.Equal("chunky", row[2].ToString());
        Assert.Equal("bacon", row[3].ToString());
    }
}
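// The tests above and below rely on a GetStreamFromString helper that is not shown;
// a minimal sketch, assuming UTF-8 input with no byte-order mark (requires
// System.IO and System.Text):
private static Stream GetStreamFromString(string s)
{
    return new MemoryStream(Encoding.UTF8.GetBytes(s));
}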
public void AddTokensFromFile(string file, TokenizerType tokType, Encoding loadAs)
{
    Utils.ThrowException(!Utils.VerifyFileNameOpen(file) ? new ArgumentValueException("file") : null);
    Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
    // using-block replaces the bare Close() so the reader is disposed even on exception
    using (StreamReader reader = new StreamReader(file, loadAs))
    {
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        tokenizer.Type = tokType;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            foreach (string token in tokenizer.GetTokens(line))
            {
                AddToken(token.ToUpper());
            }
        }
    }
}
static void CompareTokens(TokenizerType typeA, List<CToken> tokensA, TokenizerType typeB, List<CToken> tokensB)
{
    int maxCount = Math.Max(tokensA.Count, tokensB.Count);
    for (int i = 0; i < maxCount; ++i)
    {
        CToken? tokenA = (i < tokensA.Count) ? new CToken?(tokensA[i]) : null;
        CToken? tokenB = (i < tokensB.Count) ? new CToken?(tokensB[i]) : null;
        if (tokenA.HasValue && !tokenB.HasValue)
        {
            Console.WriteLine($"Missing token B at index {i} [{typeB}] -> Found A [{typeA}] = {tokenA.Value.Start}");
            break;
        }
        else if (tokenB.HasValue && !tokenA.HasValue)
        {
            Console.WriteLine($"Missing token A at index {i} [{typeA}] -> Found B [{typeB}] = {tokenB.Value.Start}");
            break;
        }
        else if (!tokenA.HasValue && !tokenB.HasValue)
        {
            // defensive branch: unreachable in practice, since i < maxCount guarantees
            // at least one of the two lists has an element at index i
            Console.WriteLine($"Missing any token at index {i}");
            break;
        }
        else
        {
            if (tokenA.Value.Kind != tokenB.Value.Kind)
            {
                Console.WriteLine($"Different token kinds ({typeA}:{tokenA.Value.Kind} vs {typeB}:{tokenB.Value.Kind}) on index {i}");
            }
            if (tokenA.Value.Start.Index != tokenB.Value.Start.Index)
            {
                Console.WriteLine($"Different token starts ({typeA}:{tokenA.Value.Start} vs {typeB}:{tokenB.Value.Start}) on index {i}");
            }
            if (tokenA.Value.Length != tokenB.Value.Length)
            {
                Console.WriteLine($"Different token lengths ({typeA}:{tokenA.Value.Length} vs {typeB}:{tokenB.Value.Length}) on index {i}");
            }
        }
    }
}
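// Hedged usage sketch: lex the same input with two tokenizer types and diff the token
// streams. LexToList and both enum members are hypothetical placeholders; only
// CompareTokens itself comes from the snippet above.
CompareTokens(TokenizerType.Managed, LexToList(TokenizerType.Managed, source),
              TokenizerType.Native, LexToList(TokenizerType.Native, source));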
public async Task MultilineCsvParse(TokenizerType rowProviderType)
{
    StringBuilder sb = new StringBuilder();
    sb.AppendLine("foo,bar,chunky,bacon");
    sb.AppendLine("bacon,is,very,chunky");
    for (int i = 0; i < 10000; i++)
    {
        sb.AppendLine("this,is,lots,of,lines");
    }
    using (var stream = GetStreamFromString(sb.ToString()))
    using (var csv = GetTokenizer(rowProviderType, stream))
    {
        var row = await csv.GetNextAsync();
        Assert.Equal(4, row.Count());
        Assert.Equal("foo", row[0].ToString());
        Assert.Equal("bar", row[1].ToString());
        Assert.Equal("chunky", row[2].ToString());
        Assert.Equal("bacon", row[3].ToString());
        row = await csv.GetNextAsync();
        Assert.Equal(4, row.Count());
        Assert.Equal("bacon", row[0].ToString());
        Assert.Equal("is", row[1].ToString());
        Assert.Equal("very", row[2].ToString());
        Assert.Equal("chunky", row[3].ToString());
        // drain the remaining rows so the tokenizer is exercised through end-of-stream
        do
        {
            row = await csv.GetNextAsync();
        } while (row != null);
    }
}
internal Enumerator(string text, TokenizerType type, int minTokenLen)
{
    mType = type;
    mText = text;
    mMinTokenLen = minTokenLen;
}
public void AddTokensFromString(string str, TokenizerType tokType)
{
    Utils.ThrowException(str == null ? new ArgumentNullException("str") : null);
    Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
    SimpleTokenizer tokenizer = new SimpleTokenizer();
    tokenizer.Type = tokType;
    tokenizer.Text = str;
    AddTokens(tokenizer);
}
public void AddTokensFromFile(string file, TokenizerType tokType)
{
    Utils.ThrowException(!Utils.VerifyFileNameOpen(file) ? new ArgumentValueException("file") : null);
    Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
    // honor a Unicode byte-order mark if present; otherwise fall back to UTF-8;
    // using-block replaces the bare Close() so the reader is disposed even on exception
    using (StreamReader reader = Utils.GetUnicodeSignature(file) != null
        ? new StreamReader(file)
        : new StreamReader(file, Encoding.UTF8))
    {
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        tokenizer.Type = tokType;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            tokenizer.Text = line;
            foreach (string token in tokenizer)
            {
                AddToken(token.ToUpper());
            }
        }
    }
}
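// Hedged usage sketch for the two file-loading overloads above; "list", the file
// names, and the enum member AllChars are placeholders. The two-argument overload
// auto-detects a Unicode BOM, while the three-argument overload forces the
// caller-supplied encoding.
list.AddTokensFromFile("stopwords.txt", TokenizerType.AllChars);
list.AddTokensFromFile("stopwords-latin2.txt", TokenizerType.AllChars, Encoding.GetEncoding("iso-8859-2"));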