Пример #1
0
 public void Load(BinarySerializer reader)
 {
     // Restores this tokenizer's state from the given serializer.
     // Utils.ThrowException presumably throws its argument when non-null,
     // so this line guards against a null reader.
     Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);
     // the following statements throw serialization-related exceptions
     // NOTE(review): read order must mirror the corresponding Save order —
     // mType first, then mMinTokenLen; do not reorder.
     mType        = (TokenizerType)reader.ReadInt();
     mMinTokenLen = reader.ReadInt();
 }
Пример #2
0
        public TokenizeConfig(string configStr)
        {
            // Parse the configuration string into key/value pairs and apply
            // each recognized setting; unknown keys are silently ignored.
            foreach (var entry in StringOperations.ParseStringStringDictionary(configStr))
            {
                string key = entry.Key;
                string value = entry.Value;

                if (key == "TokenizerType")
                {
                    TokenizerType = (TokenizerType)StringOperations.ParseEnum(typeof(TokenizerType), value);
                }
                else if (key == "StopWordFile")
                {
                    StopWordFile = value;
                }
                else if (key == "AddStopWordsFile")
                {
                    // An empty value is treated as "not configured".
                    AddStopWordsFile = value.Length > 0 ? value : null;
                }
                else if (key == "UserDictFile")
                {
                    UserDictFile = value.Length > 0 ? value : null;
                }
            }

            Initialize();
        }
Пример #3
0
        public TokenizeConfig(TokenizerType TokenizerType,
                              string StopWordFile = null, string AddStopWordsFile = null,
                              string UserDictFile = null)
        {
            // Parameter names intentionally mirror the properties they
            // populate, hence the "this." qualification on every assignment.
            this.UserDictFile = UserDictFile;
            this.AddStopWordsFile = AddStopWordsFile;
            this.StopWordFile = StopWordFile;
            this.TokenizerType = TokenizerType;

            Initialize();
        }
Пример #4
0
        public void AddTokensFromString(string str, TokenizerType tokType)
        {
            // Guard clauses: reject a null input string and refuse to modify
            // the set once it has been ranked.
            Utils.ThrowException(str == null ? new ArgumentNullException("str") : null);
            Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);

            var tokenizer = new SimpleTokenizer { Type = tokType };

            // Tokens are stored upper-cased.
            foreach (string tok in tokenizer.GetTokens(str))
            {
                AddToken(tok.ToUpper());
            }
        }
Пример #5
0
        private ITokenizer GetTokenizer(TokenizerType rowProviderType, Stream stream)
        {
            // Maps each provider type to its tokenizer implementation; any
            // other value is rejected as invalid data.
            if (rowProviderType == TokenizerType.Pipeline)
            {
                return new PipelineStreamTokenizer(stream);
            }
            if (rowProviderType == TokenizerType.String)
            {
                return new StreamReaderTokenizer(new StreamReader(stream));
            }
            if (rowProviderType == TokenizerType.ArrayPool)
            {
                return new MemoryManagedTokenizer(stream);
            }
            throw new InvalidDataException();
        }
Пример #6
0
        public async Task BasicCsvParse(TokenizerType rowProviderType)
        {
            // A single CSV line should parse into one row with four fields.
            string[] expected = { "foo", "bar", "chunky", "bacon" };

            using (var stream = GetStreamFromString("foo,bar,chunky,bacon"))
                using (var csv = GetTokenizer(rowProviderType, stream))
                {
                    var row = await csv.GetNextAsync();

                    Assert.Equal(expected.Length, row.Count());

                    for (int i = 0; i < expected.Length; i++)
                    {
                        Assert.Equal(expected[i], row[i].ToString());
                    }
                }
        }
Пример #7
0
        public void AddTokensFromFile(string file, TokenizerType tokType, Encoding loadAs)
        {
            // Reads the file line by line using the given encoding, tokenizes
            // each line, and adds the upper-cased tokens to this set.
            // Throws ArgumentValueException when the file cannot be opened and
            // InvalidOperationException when the set has already been ranked.
            Utils.ThrowException(!Utils.VerifyFileNameOpen(file) ? new ArgumentValueException("file") : null);
            Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
            SimpleTokenizer tokenizer = new SimpleTokenizer();

            tokenizer.Type = tokType;
            // using guarantees the reader is closed even if tokenization or
            // AddToken throws; the original Close() ran only on the success path.
            using (StreamReader reader = new StreamReader(file, loadAs))
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    foreach (string token in tokenizer.GetTokens(line))
                    {
                        AddToken(token.ToUpper());
                    }
                }
            }
        }
Пример #8
0
        static void CompareTokens(TokenizerType typeA, List <CToken> tokensA, TokenizerType typeB, List <CToken> tokensB)
        {
            // Compares two token streams element by element, reporting (via
            // Console) the first length mismatch and, for aligned pairs, any
            // difference in kind, start position, or length.
            int maxCount = Math.Max(tokensA.Count, tokensB.Count);

            for (int i = 0; i < maxCount; ++i)
            {
                CToken? tokenA = (i < tokensA.Count) ? new CToken?(tokensA[i]) : null;
                CToken? tokenB = (i < tokensB.Count) ? new CToken?(tokensB[i]) : null;
                if (tokenA.HasValue && !tokenB.HasValue)
                {
                    Console.WriteLine($"Missing token B at index {i} [{typeB}] -> Found A [{typeA}] = {tokenA.Value.Start}");
                    break;
                }
                if (tokenB.HasValue && !tokenA.HasValue)
                {
                    Console.WriteLine($"Missing token A at index {i} [{typeA}] -> Found B [{typeB}] = {tokenB.Value.Start}");
                    break;
                }
                // Both tokens exist here: i < Math.Max(countA, countB) means at
                // least one list has an element at i, and the one-sided misses
                // were handled above. (The original "both missing" branch was
                // unreachable dead code and has been removed.)
                if (tokenA.Value.Kind != tokenB.Value.Kind)
                {
                    Console.WriteLine($"Different token kinds ({typeA}:{tokenA.Value.Kind} vs {typeB}:{tokenB.Value.Kind}) on index {i}");
                }
                if (tokenA.Value.Start.Index != tokenB.Value.Start.Index)
                {
                    Console.WriteLine($"Different token starts ({typeA}:{tokenA.Value.Start} vs {typeB}:{tokenB.Value.Start}) on index {i}");
                }
                if (tokenA.Value.Length != tokenB.Value.Length)
                {
                    Console.WriteLine($"Different token lengths ({typeA}:{tokenA.Value.Length} vs {typeB}:{tokenB.Value.Length}) on index {i}");
                }
            }
        }
Пример #9
0
        public async Task MultilineCsvParse(TokenizerType rowProviderType)
        {
            // Build the CSV payload: two known rows followed by 10000 filler lines.
            var sb = new StringBuilder();
            sb.AppendLine("foo,bar,chunky,bacon");
            sb.AppendLine("bacon,is,very,chunky");
            for (int i = 0; i < 10000; i++)
            {
                sb.AppendLine("this,is,lots,of,lines");
            }

            string[] expectedFirst = { "foo", "bar", "chunky", "bacon" };
            string[] expectedSecond = { "bacon", "is", "very", "chunky" };

            using (var stream = GetStreamFromString(sb.ToString()))
                using (var csv = GetTokenizer(rowProviderType, stream))
                {
                    // First row.
                    var row = await csv.GetNextAsync();
                    Assert.Equal(expectedFirst.Length, row.Count());
                    for (int j = 0; j < expectedFirst.Length; j++)
                    {
                        Assert.Equal(expectedFirst[j], row[j].ToString());
                    }

                    // Second row.
                    row = await csv.GetNextAsync();
                    Assert.Equal(expectedSecond.Length, row.Count());
                    for (int j = 0; j < expectedSecond.Length; j++)
                    {
                        Assert.Equal(expectedSecond[j], row[j].ToString());
                    }

                    // Drain the remaining filler rows until the stream is exhausted.
                    do
                    {
                        row = await csv.GetNextAsync();
                    } while (row != null);
                }
        }
Пример #10
0
 internal Enumerator(string text, TokenizerType type, int minTokenLen)
 {
     // Store the source text, tokenizer type, and minimum token length
     // for use by the enumeration members.
     mText = text;
     mMinTokenLen = minTokenLen;
     mType = type;
 }
Пример #11
0
 public void AddTokensFromString(string str, TokenizerType tokType)
 {
     // Guard clauses: reject a null input string and refuse to modify the
     // set once it has been ranked.
     Utils.ThrowException(str == null ? new ArgumentNullException("str") : null);
     Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
     var tokenizer = new SimpleTokenizer
     {
         Type = tokType,
         Text = str
     };
     AddTokens(tokenizer);
 }
Пример #12
0
 public void AddTokensFromFile(string file, TokenizerType tokType)
 {
     // Reads the file line by line — using the encoding implied by a Unicode
     // signature when one is detected, otherwise UTF-8 — tokenizes each line,
     // and adds the upper-cased tokens to this set.
     // Throws ArgumentValueException when the file cannot be opened and
     // InvalidOperationException when the set has already been ranked.
     Utils.ThrowException(!Utils.VerifyFileNameOpen(file) ? new ArgumentValueException("file") : null);
     Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
     SimpleTokenizer tokenizer = new SimpleTokenizer();
     tokenizer.Type = tokType;
     // using guarantees the reader is closed even if tokenization or AddToken
     // throws; the original Close() ran only on the success path.
     using (StreamReader reader = Utils.GetUnicodeSignature(file) != null ? new StreamReader(file) : new StreamReader(file, Encoding.UTF8))
     {
         string line;
         while ((line = reader.ReadLine()) != null)
         {
             tokenizer.Text = line;
             foreach (string token in tokenizer)
             {
                 AddToken(token.ToUpper());
             }
         }
     }
 }