public void Start()
{
    string path = "SqlStatements.json";
    var jsonSerializer = new JsonFileSerializer();
    while (true)
    {
        Console.WriteLine($"Press 1 : Tokenize list of sql statements{Environment.NewLine}");
        var selection = Console.ReadKey();
        switch (selection.KeyChar.ToString())
        {
            case "1":
                foreach (var statement in jsonSerializer.ParseSqlStrings(path))
                {
                    var tokenizer = new RegexTokenizer();
                    Console.WriteLine($"Outputting token list:{Environment.NewLine}");
                    OutputTokenList(tokenizer, statement);
                }
                break;
            default:
                Console.WriteLine("Enter a valid selection.");
                break;
        }
    }
}
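// For reference, a hedged sketch of what SqlStatements.json might
// contain, inferred only from the SqlStatement.StatementValue property
// used by OutputTokenList below; the exact schema is an assumption.
//
//   [
//     { "StatementValue": "SELECT Id, Name FROM Users WHERE Id = 1" },
//     { "StatementValue": "DELETE FROM Sessions WHERE Expired = 1" }
//   ]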
public HtmlTokenizer(string text, IStemmer stemmer, bool decodeTextBlocks, bool tokenizeTextBlocks, bool applySkipRules)
{
    Utils.ThrowException(text == null ? new ArgumentNullException("text") : null);
    mText = text;
    mStemmer = stemmer;
    mDecodeTextBlocks = decodeTextBlocks;
    mTokenizeTextBlocks = tokenizeTextBlocks;
    mApplySkipRules = applySkipRules;
    HtmlDocument htmlDoc = new HtmlDocument();
    Configure(htmlDoc);
    htmlDoc.LoadHtml(text);
    HtmlNodeCollection nodes = new HtmlNodeCollection(/*parentNode=*/ null);
    nodes.Add(htmlDoc.DocumentNode);
    RegexTokenizer textBlockTokenizer = null;
    if (mTokenizeTextBlocks)
    {
        textBlockTokenizer = new RegexTokenizer();
        textBlockTokenizer.TokenRegex = string.Format("({0})|({1})", mWordRegexStr, mNumberRegexStr);
        textBlockTokenizer.IgnoreUnknownTokens = true;
    }
    CreateTokens(nodes, textBlockTokenizer);
}
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 *
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStream TokenStream(String fieldName, String text)
{
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (text == null)
    {
        throw new ArgumentException("text must not be null");
    }
    TokenStream stream;
    if (Regex == NON_WORD_PATTERN)
    { // fast path
        stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    else if (Regex == WHITESPACE_PATTERN)
    { // fast path
        stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }
    else
    {
        stream = new RegexTokenizer(text, Regex, toLowerCase);
        if (stopWords != null)
        {
            stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
        }
    }
    return stream;
}
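// Hedged call-site sketch for the TokenStream method above. Assumption:
// the surrounding class is Lucene.Net's contrib PatternAnalyzer (the
// Regex, NON_WORD_PATTERN, toLowerCase, stopWords and matchVersion
// members it references all match that class). The 3.0.3-era attribute
// API is used below; names differ across Lucene.Net versions, so verify
// against yours.
using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Tokenattributes;
using Version = Lucene.Net.Util.Version;

class TokenStreamDemo
{
    static void Main()
    {
        var analyzer = new PatternAnalyzer(
            Version.LUCENE_30,
            PatternAnalyzer.NON_WORD_PATTERN, // fast path: split on non-word chars
            /*toLowerCase=*/ true,
            /*stopWords=*/ null);
        TokenStream stream = analyzer.TokenStream("body", "The quick brown fox");
        var termAtt = stream.AddAttribute<ITermAttribute>();
        while (stream.IncrementToken())
        {
            Console.WriteLine(termAtt.Term); // the, quick, brown, fox
        }
    }
}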
public void Tokenize_IsInvalidZeroToken()
{
    List<string> listValue = new List<string>();
    listValue.Add(".00");
    ITokenizer tokenizer = new RegexTokenizer();
    // Tokenize returns a lazy sequence; materialize it so tokenization
    // actually runs (presumably ".00" is expected to trigger a failure,
    // e.g. via an expected-exception attribute not shown here).
    List<Token> tokens = tokenizer.Tokenize(listValue).ToList();
}
public void EndingMatchTest()
{
    const string text = "Error in machine ID 123";
    var sut = new RegexTokenizer(@"Machine id (\d+)", new Color(ConsoleColor.Blue), RegexOptions.IgnoreCase);
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("Error in machine ID ", x.Text),
        x => Assert.Equal("123", x.Text)
    );
}
public void BeginningMatchTest()
{
    const string text = "123 some text";
    var sut = new RegexTokenizer(@"\d+", new Color(ConsoleColor.Blue));
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("123", x.Text),
        x => Assert.Equal(" some text", x.Text)
    );
}
public void NonCapturingGroupIsIgnored()
{
    const string text = "2019/07/07 ERR some error message";
    var sut = new RegexTokenizer(@"\d{4}\/\d{2}\/\d{2} (?:ERR)*\s*(.+)", new Color(ConsoleColor.Red));
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("2019/07/07 ERR ", x.Text),
        x => Assert.Equal("some error message", x.Text)
    );
}
public void InMiddleMatchTest()
{
    const string text = "2019/07/07 ERR error message";
    var sut = new RegexTokenizer("ERR", new Color(ConsoleColor.Yellow));
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("2019/07/07 ", x.Text),
        x => Assert.Equal("ERR", x.Text),
        x => Assert.Equal(" error message", x.Text)
    );
}
public TokenDefintion(string pattern)
{
    Priority = -1;
    Ignore = true;
    var tokenizer = new RegexTokenizer();
    var tokens = tokenizer.Tokenize(pattern);
    var compiler = new RegexParser(tokens);
    Regex = compiler.Parse();
}
/// <summary>
/// Tokenizes the given SQL statement, prints each token, and then outputs its parse tree.
/// </summary>
/// <param name="tokenizer">The tokenizer used to split the statement into tokens.</param>
/// <param name="sqlStatement">The SQL statement to tokenize and print.</param>
public void OutputTokenList(RegexTokenizer tokenizer, SqlStatement sqlStatement)
{
    sqlStatement.tokenList.AddRange(tokenizer.Tokenize(sqlStatement.StatementValue));
    sqlStatement.tokenList.ForEach(x => Console.WriteLine($"Token Type: {x.TokenType}, Token Value: {x.Value}"));
    //var temp = parser.Parse(sqlStatement.tokenList);
    OutputParseTree(sqlStatement);
    Console.WriteLine($"End of statement.{Environment.NewLine}");
}
public TokenDefintion(TType type, string pattern, int priority)
{
    Type = type;
    Priority = priority;
    Ignore = false;
    var tokenizer = new RegexTokenizer();
    var tokens = tokenizer.Tokenize(pattern);
    var compiler = new RegexParser(tokens);
    Regex = compiler.Parse();
}
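// A hedged construction sketch for the two TokenDefintion overloads
// above. TType.Number and both patterns are illustrative placeholders,
// not values taken from the source.
var number = new TokenDefintion(TType.Number, "[0-9]+", /*priority=*/ 1);

// The pattern-only overload sets Priority = -1 and Ignore = true, so a
// lexer built on these definitions would match but discard such tokens.
var whitespace = new TokenDefintion(" +");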
public void LoadFromText(string text)
{
    Utils.ThrowException(text == null ? new ArgumentNullException("text") : null);
    mTaggedWords.Clear();
    mTeiHeader = null;
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.TokenRegex = @"\p{L}+(-\p{L}+)*";
    tokenizer.IgnoreUnknownTokens = false;
    foreach (string word in tokenizer.GetTokens(text))
    {
        mTaggedWords.Add(new TaggedWord(word, /*tag=*/ null, /*lemma=*/ null));
    }
}
public void MultipleMatchesTest()
{
    const string text = "Traffic light has Red, Yellow and Green colors";
    var sut = new RegexTokenizer(@"(Red|Yellow|Green)", new Color(ConsoleColor.Blue), RegexOptions.IgnoreCase);
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("Traffic light has ", x.Text),
        x => Assert.Equal("Red", x.Text),
        x => Assert.Equal(", ", x.Text),
        x => Assert.Equal("Yellow", x.Text),
        x => Assert.Equal(" and ", x.Text),
        x => Assert.Equal("Green", x.Text),
        x => Assert.Equal(" colors", x.Text)
    );
}
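// Taken together, the tests above pin down the usage shape of this
// RegexTokenizer: construct it with a pattern, a highlight Color, and
// optional RegexOptions, then Parse() a line into alternating unmatched
// and matched segments. A minimal sketch; only the .Text member is
// confirmed by the assertions, anything else would be an assumption.
var sut = new RegexTokenizer(@"\d+", new Color(ConsoleColor.Blue));
foreach (var segment in sut.Parse("Error in machine ID 123"))
{
    Console.Write($"[{segment.Text}]"); // prints: [Error in machine ID ][123]
}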
public void CanTokenizeAndParseQueryString()
{
    string query = @"
        MATCH app = 'MyTestApp'
        AND ex IN ('System.NullReferenceException', 'System.FormatException')
        BETWEEN 2016-01-01 00:00:00 AND 2016-02-01 00:00:00
        LIMIT 100";

    var tokenizer = new RegexTokenizer();
    var tokenSequence = tokenizer.Tokenize(query).ToList();

    var parser = new QueryParser();
    var dataRepresentation = parser.Parse(tokenSequence);
    var json = JsonConvert.SerializeObject(dataRepresentation, Formatting.Indented, SerializerSettings);
    Assert.NotNull(json);
}
private void CreateTokens(HtmlNodeCollection nodes, RegexTokenizer textBlockTokenizer)
{
    foreach (HtmlNode node in nodes)
    {
        Token endTag;
        IEnumerable<Token> tokens = CreateToken(node, out endTag, textBlockTokenizer);
        if (tokens != null)
        {
            mTokenList.AddRange(tokens);
        }
        if (!mApplySkipRules || !mSkipTagList.Contains(node.Name.ToLower()))
        {
            CreateTokens(node.ChildNodes, textBlockTokenizer);
        }
        if (endTag != null)
        {
            mTokenList.Add(endTag);
        }
    }
}
public void ValidateFile()
{
    // Get all lines of the file as tokens. (The original wrapped this in
    // try { ... } catch (Exception) { throw; }, which is a no-op and has
    // been removed; a bare rethrow preserves the exception unchanged.)
    ITokenizer tokenizer = new RegexTokenizer();
    var tokenSequence = tokenizer.Tokenize(_fileContent).ToList();

    // Parse all tokens.
    Parser p = new Parser();
    _expenses = p.Parse(tokenSequence);
    if (_expenses.Count == 0)
    {
        throw new Exception("Nothing to Process");
    }
}
static void Main(string[] args)
{
    // load documents
    Utils.VerboseLine("Loading documents ...");
    string[] docs = File.ReadAllLines("C:\\newwork\\testclustering\\data\\yahoofinance.txt");
    BowSpace bowSpace = new BowSpace();
    bowSpace.StopWords = StopWords.EnglishStopWords;
    bowSpace.Stemmer = new PorterStemmer();
    bowSpace.WordWeightType = WordWeightType.TfIdf;
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.IgnoreUnknownTokens = true;
    bowSpace.Tokenizer = tokenizer;
    bowSpace.Initialize(docs);
    // compute layout
    SemanticSpaceLayout semSpc = new SemanticSpaceLayout(bowSpace);
    Vector2D[] coords = semSpc.ComputeLayout();
    // build spatial index
    //Utils.VerboseLine("Building spatial index ...");
    //SpatialIndex2D spatIdx = new SpatialIndex2D();
    //spatIdx.BuildIndex(coords);
    //spatIdx.InsertPoint(9000, new Vector2D(1000, 1000));
    //ArrayList<IdxDat<Vector2D>> points = spatIdx.GetPoints(new Vector2D(0.5, 0.5), 0.1);
    //Utils.VerboseLine("Number of retrieved points: {0}.", points.Count);
    ArrayList<Vector2D> tmp = new ArrayList<Vector2D>(coords);
    tmp.Shuffle();
    //tmp.RemoveRange(1000, tmp.Count - 1000);
    // compute elevation
    StreamWriter writer = new StreamWriter("c:\\elev.txt");
    LayoutSettings ls = new LayoutSettings(800, 600);
    ls.AdjustmentType = LayoutAdjustmentType.Soft;
    ls.StdDevMult = 2;
    ls.FitToBounds = true;
    ls.MarginVert = 50;
    ls.MarginHoriz = 50;
    double[,] zMtx = VisualizationUtils.ComputeLayoutElevation(tmp, ls, 150, 200);
    VisualizationUtils.__DrawElevation__(tmp, ls, 300, 400).Save("c:\\elev.bmp");
    for (int row = 0; row < zMtx.GetLength(0); row++)
    {
        for (int col = 0; col < zMtx.GetLength(1); col++)
        {
            writer.Write("{0}\t", zMtx[row, col]);
        }
        writer.WriteLine();
    }
    writer.Close();
    // output coordinates
    StreamWriter tsvWriter = new StreamWriter("c:\\layout.tsv");
    for (int i = 0; i < coords.Length; i++)
    {
        //if (i < points.Count)
        //{
        //    tsvWriter.WriteLine("{0}\t{1}\t{2}\t{3}", coords[i].X, coords[i].Y, points[i].Dat.X, points[i].Dat.Y);
        //}
        //else
        {
            tsvWriter.WriteLine("{0}\t{1}", coords[i].X, coords[i].Y);
        }
    }
    tsvWriter.Close();
    //// get document names
    //int k = 0;
    //ArrayList<Pair<string, Vector2D>> layout = new ArrayList<Pair<string, Vector2D>>();
    //foreach (string doc in docs)
    //{
    //    string[] docInfo = doc.Split(' ');
    //    layout.Add(new Pair<string, Vector2D>(docInfo[0], coords[k++]));
    //}
    //Console.WriteLine(coords.Length);
    //Console.WriteLine(layout.Count);
    //StreamWriter writer = new StreamWriter("c:\\vidCoords.txt");
    //foreach (Pair<string, Vector2D> docPos in layout)
    //{
    //    writer.WriteLine("{0}\t{1}\t{2}", docPos.First, docPos.Second.X, docPos.Second.Y);
    //}
    //writer.Close();
}
private IEnumerable<Token> CreateToken(HtmlNode node, out Token endTag, RegexTokenizer textBlockTokenizer)
{
    IEnumerable<Token> tokens = null;
    endTag = null;
    if (node.NodeType == HtmlNodeType.Element)
    {
        // case 1: open tag like <i> without </i> (inside another tag like <b><i></b>)
        if (node._innerlength <= 0 && node._outerlength <= 0)
        {
            Token token = new Token();
            token.mTokenType = TokenType.OpenTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._innerstartindex - node._outerstartindex;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
        }
        // case 2: open tag like <i> without </i> (other cases)
        else if (node._innerlength <= 0 && node.EndNode == null)
        {
            Token token = new Token();
            token.mTokenType = TokenType.OpenTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._outerlength;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
        }
        // case 3: empty tag like <br> or <br/>
        else if (node._innerlength <= 0)
        {
            if (node.EndNode._outerstartindex != node._outerstartindex) // handle <tag></tag> pair
            {
                string startTagStr = mText.Substring(node._outerstartindex, node.EndNode._outerstartindex - node._outerstartindex);
                Token firstTag = new Token();
                firstTag.mTokenType = TokenType.StartTag;
                firstTag.mStartIndex = node._outerstartindex;
                firstTag.mLength = startTagStr.Length;
                firstTag.mTokenStr = startTagStr;
                firstTag.mTagName = node.Name.ToLower();
                string endTagStr = mText.Substring(node.EndNode._outerstartindex, node.EndNode._outerlength);
                Token secondTag = new Token();
                secondTag.mTokenType = TokenType.EndTag;
                secondTag.mStartIndex = firstTag.mStartIndex + firstTag.mLength;
                secondTag.mLength = endTagStr.Length;
                secondTag.mTokenStr = endTagStr;
                secondTag.mTagName = firstTag.mTagName;
                tokens = new Token[] { firstTag, secondTag };
            }
            else // handle <tag/>
            {
                Token token = new Token();
                token.mTokenType = TokenType.EmptyTag;
                token.mStartIndex = node._outerstartindex;
                token.mLength = node._outerlength;
                token.mTokenStr = mText.Substring(node._outerstartindex, node._outerlength);
                token.mTagName = node.Name.ToLower();
                tokens = new Token[] { token };
            }
        }
        // case 4: closed tag like <b>some text</b>
        else
        {
            Token token = new Token();
            token.mTokenType = TokenType.StartTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._innerstartindex - node._outerstartindex;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
            endTag = new Token();
            endTag.mTokenType = TokenType.EndTag;
            endTag.mStartIndex = node._innerstartindex + node._innerlength;
            endTag.mLength = node._outerstartindex + node._outerlength - endTag.mStartIndex;
            endTag.mTokenStr = mText.Substring(endTag.mStartIndex, endTag.mLength);
            endTag.mTagName = token.mTagName;
        }
    }
    else if (node.NodeType == HtmlNodeType.Text)
    {
        if (textBlockTokenizer == null)
        {
            Token token = new Token();
            token.mTokenType = TokenType.Text;
            token.mStartIndex = node._innerstartindex;
            token.mLength = node._innerlength;
            token.mTokenStr = mText.Substring(node._innerstartindex, node._innerlength);
            if (mDecodeTextBlocks)
            {
                token.mTokenStr = HttpUtility.HtmlDecode(token.mTokenStr);
            }
            tokens = new Token[] { token };
        }
        else // tokenize text block
        {
            tokens = new ArrayList<Token>();
            string text = mText.Substring(node._innerstartindex, node._innerlength);
            textBlockTokenizer.Text = mDecodeTextBlocks ? HttpUtility.HtmlDecode(text) : text;
            RegexTokenizer.Enumerator tokEnum = (RegexTokenizer.Enumerator)textBlockTokenizer.GetEnumerator();
            int baseIdx = node._innerstartindex;
            while (tokEnum.MoveNext())
            {
                string tokenStr = tokEnum.Current;
                Token token = new Token();
                token.mTokenType = GetTokenType(tokenStr);
                if (!mDecodeTextBlocks)
                {
                    token.mStartIndex = baseIdx + tokEnum.CurrentTokenIdx;
                    token.mLength = tokenStr.Length;
                }
                token.mTokenStr = tokenStr;
                ((ArrayList<Token>)tokens).Add(token);
            }
            if (((ArrayList<Token>)tokens).Count == 0)
            {
                tokens = null;
            }
        }
    }
    return tokens;
}