public void Start()
{
    string path = "SqlStatements.json";
    var jsonSerializer = new JsonFileSerializer();
    while (true)
    {
        Console.WriteLine($"Press 1 : Tokenize list of sql statements{Environment.NewLine}");
        var selection = Console.ReadKey();
        switch (selection.KeyChar.ToString())
        {
            case "1":
                foreach (var statement in jsonSerializer.ParseSqlStrings(path))
                {
                    var tokenizer = new RegexTokenizer();
                    Console.WriteLine($"Outputting token list:{Environment.NewLine}");
                    OutputTokenList(tokenizer, statement);
                }
                break;
            default:
                Console.WriteLine("Enter a valid selection.");
                break;
        }
    }
}
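// For reference, a hedged sketch of what SqlStatements.json might
// contain, inferred only from the SqlStatement.StatementValue property
// used by OutputTokenList below; the exact schema is an assumption.
//
//   [
//     { "StatementValue": "SELECT Id, Name FROM Users WHERE Id = 1" },
//     { "StatementValue": "DELETE FROM Sessions WHERE Expired = 1" }
//   ]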
public HtmlTokenizer(string text, IStemmer stemmer, bool decodeTextBlocks, bool tokenizeTextBlocks, bool applySkipRules)
{
    Utils.ThrowException(text == null ? new ArgumentNullException("text") : null);
    mText = text;
    mStemmer = stemmer;
    mDecodeTextBlocks = decodeTextBlocks;
    mTokenizeTextBlocks = tokenizeTextBlocks;
    mApplySkipRules = applySkipRules;
    HtmlDocument htmlDoc = new HtmlDocument();
    Configure(htmlDoc);
    htmlDoc.LoadHtml(text);
    HtmlNodeCollection nodes = new HtmlNodeCollection(/*parentNode=*/ null);
    nodes.Add(htmlDoc.DocumentNode);
    RegexTokenizer textBlockTokenizer = null;
    if (mTokenizeTextBlocks)
    {
        textBlockTokenizer = new RegexTokenizer();
        textBlockTokenizer.TokenRegex = string.Format("({0})|({1})", mWordRegexStr, mNumberRegexStr);
        textBlockTokenizer.IgnoreUnknownTokens = true;
    }
    CreateTokens(nodes, textBlockTokenizer);
}
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 *
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStream TokenStream(String fieldName, String text)
{
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (text == null)
    {
        throw new ArgumentException("text must not be null");
    }
    TokenStream stream;
    if (Regex == NON_WORD_PATTERN)
    { // fast path
        stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    else if (Regex == WHITESPACE_PATTERN)
    { // fast path
        stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }
    else
    {
        stream = new RegexTokenizer(text, Regex, toLowerCase);
        if (stopWords != null)
        {
            stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
        }
    }
    return stream;
}
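// Hedged call-site sketch for the TokenStream method above. Assumption:
// the surrounding class is Lucene.Net's contrib PatternAnalyzer (the
// Regex, NON_WORD_PATTERN, toLowerCase, stopWords and matchVersion
// members it references all match that class). The 3.0.3-era attribute
// API is used below; names differ across Lucene.Net versions, so verify
// against yours.
using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Tokenattributes;
using Version = Lucene.Net.Util.Version;

class TokenStreamDemo
{
    static void Main()
    {
        var analyzer = new PatternAnalyzer(
            Version.LUCENE_30,
            PatternAnalyzer.NON_WORD_PATTERN, // fast path: split on non-word chars
            /*toLowerCase=*/ true,
            /*stopWords=*/ null);
        TokenStream stream = analyzer.TokenStream("body", "The quick brown fox");
        var termAtt = stream.AddAttribute<ITermAttribute>();
        while (stream.IncrementToken())
        {
            Console.WriteLine(termAtt.Term); // the, quick, brown, fox
        }
    }
}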
public void Tokenize_IsInvalidZeroToken()
{
    List<string> listValue = new List<string>();
    listValue.Add(".00");
    ITokenizer tokenizer = new RegexTokenizer();
    // Tokenize returns a lazy sequence; materialize it so tokenization
    // actually runs (presumably ".00" is expected to trigger a failure,
    // e.g. via an expected-exception attribute not shown here).
    List<Token> tokens = tokenizer.Tokenize(listValue).ToList();
}
public void EndingMatchTest()
{
    const string text = "Error in machine ID 123";
    var sut = new RegexTokenizer(@"Machine id (\d+)", new Color(ConsoleColor.Blue), RegexOptions.IgnoreCase);
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("Error in machine ID ", x.Text),
        x => Assert.Equal("123", x.Text)
    );
}
public void BeginningMatchTest()
{
    const string text = "123 some text";
    var sut = new RegexTokenizer(@"\d+", new Color(ConsoleColor.Blue));
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("123", x.Text),
        x => Assert.Equal(" some text", x.Text)
    );
}
public void NonCapturingGroupIsIgnored()
{
    const string text = "2019/07/07 ERR some error message";
    var sut = new RegexTokenizer(@"\d{4}\/\d{2}\/\d{2} (?:ERR)*\s*(.+)", new Color(ConsoleColor.Red));
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("2019/07/07 ERR ", x.Text),
        x => Assert.Equal("some error message", x.Text)
    );
}
public void InMiddleMatchTest()
{
    const string text = "2019/07/07 ERR error message";
    var sut = new RegexTokenizer("ERR", new Color(ConsoleColor.Yellow));
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("2019/07/07 ", x.Text),
        x => Assert.Equal("ERR", x.Text),
        x => Assert.Equal(" error message", x.Text)
    );
}
public TokenDefintion(string pattern)
{
    Priority = -1;
    Ignore = true;
    var tokenizer = new RegexTokenizer();
    var tokens = tokenizer.Tokenize(pattern);
    var compiler = new RegexParser(tokens);
    Regex = compiler.Parse();
}
/// <summary>
/// Tokenizes the given SQL statement, prints each token, and then outputs its parse tree.
/// </summary>
/// <param name="tokenizer">The tokenizer used to split the statement into tokens.</param>
/// <param name="sqlStatement">The SQL statement to tokenize and print.</param>
public void OutputTokenList(RegexTokenizer tokenizer, SqlStatement sqlStatement)
{
    sqlStatement.tokenList.AddRange(tokenizer.Tokenize(sqlStatement.StatementValue));
    sqlStatement.tokenList.ForEach(x => Console.WriteLine($"Token Type: {x.TokenType}, Token Value: {x.Value}"));
    //var temp = parser.Parse(sqlStatement.tokenList);
    OutputParseTree(sqlStatement);
    Console.WriteLine($"End of statement.{Environment.NewLine}");
}
public TokenDefintion(TType type, string pattern, int priority)
{
    Type = type;
    Priority = priority;
    Ignore = false;
    var tokenizer = new RegexTokenizer();
    var tokens = tokenizer.Tokenize(pattern);
    var compiler = new RegexParser(tokens);
    Regex = compiler.Parse();
}
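// A hedged construction sketch for the two TokenDefintion overloads
// above. TType.Number and both patterns are illustrative placeholders,
// not values taken from the source.
var number = new TokenDefintion(TType.Number, "[0-9]+", /*priority=*/ 1);

// The pattern-only overload sets Priority = -1 and Ignore = true, so a
// lexer built on these definitions would match but discard such tokens.
var whitespace = new TokenDefintion(" +");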
public void LoadFromText(string text)
{
    Utils.ThrowException(text == null ? new ArgumentNullException("text") : null);
    mTaggedWords.Clear();
    mTeiHeader = null;
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.TokenRegex = @"\p{L}+(-\p{L}+)*";
    tokenizer.IgnoreUnknownTokens = false;
    foreach (string word in tokenizer.GetTokens(text))
    {
        mTaggedWords.Add(new TaggedWord(word, /*tag=*/ null, /*lemma=*/ null));
    }
}
public void MultipleMatchesTest()
{
    const string text = "Traffic light has Red, Yellow and Green colors";
    var sut = new RegexTokenizer(@"(Red|Yellow|Green)", new Color(ConsoleColor.Blue), RegexOptions.IgnoreCase);
    Assert.Collection(sut.Parse(text),
        x => Assert.Equal("Traffic light has ", x.Text),
        x => Assert.Equal("Red", x.Text),
        x => Assert.Equal(", ", x.Text),
        x => Assert.Equal("Yellow", x.Text),
        x => Assert.Equal(" and ", x.Text),
        x => Assert.Equal("Green", x.Text),
        x => Assert.Equal(" colors", x.Text)
    );
}
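// Taken together, the tests above pin down the usage shape of this
// RegexTokenizer: construct it with a pattern, a highlight Color, and
// optional RegexOptions, then Parse() a line into alternating unmatched
// and matched segments. A minimal sketch; only the .Text member is
// confirmed by the assertions, anything else would be an assumption.
var sut = new RegexTokenizer(@"\d+", new Color(ConsoleColor.Blue));
foreach (var segment in sut.Parse("Error in machine ID 123"))
{
    Console.Write($"[{segment.Text}]"); // prints: [Error in machine ID ][123]
}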
public void CanTokenizeAndParseQueryString()
{
    string query = @"
        MATCH app = 'MyTestApp'
        AND ex IN ('System.NullReferenceException', 'System.FormatException')
        BETWEEN 2016-01-01 00:00:00 AND 2016-02-01 00:00:00
        LIMIT 100";

    var tokenizer = new RegexTokenizer();
    var tokenSequence = tokenizer.Tokenize(query).ToList();

    var parser = new QueryParser();
    var dataRepresentation = parser.Parse(tokenSequence);
    var json = JsonConvert.SerializeObject(dataRepresentation, Formatting.Indented, SerializerSettings);
    Assert.NotNull(json);
}
private void CreateTokens(HtmlNodeCollection nodes, RegexTokenizer textBlockTokenizer)
{
    foreach (HtmlNode node in nodes)
    {
        Token endTag;
        IEnumerable<Token> tokens = CreateToken(node, out endTag, textBlockTokenizer);
        if (tokens != null)
        {
            mTokenList.AddRange(tokens);
        }
        if (!mApplySkipRules || !mSkipTagList.Contains(node.Name.ToLower()))
        {
            CreateTokens(node.ChildNodes, textBlockTokenizer);
        }
        if (endTag != null)
        {
            mTokenList.Add(endTag);
        }
    }
}
public void ValidateFile()
{
    // Get all lines of the file as tokens. (The original wrapped this in
    // try { ... } catch (Exception) { throw; }, which is a no-op and has
    // been removed; a bare rethrow preserves the exception unchanged.)
    ITokenizer tokenizer = new RegexTokenizer();
    var tokenSequence = tokenizer.Tokenize(_fileContent).ToList();

    // Parse all tokens.
    Parser p = new Parser();
    _expenses = p.Parse(tokenSequence);
    if (_expenses.Count == 0)
    {
        throw new Exception("Nothing to Process");
    }
}
static void Main(string[] args)
{
    // load documents
    Utils.VerboseLine("Loading documents ...");
    string[] docs = File.ReadAllLines("C:\\newwork\\testclustering\\data\\yahoofinance.txt");
    BowSpace bowSpace = new BowSpace();
    bowSpace.StopWords = StopWords.EnglishStopWords;
    bowSpace.Stemmer = new PorterStemmer();
    bowSpace.WordWeightType = WordWeightType.TfIdf;
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.IgnoreUnknownTokens = true;
    bowSpace.Tokenizer = tokenizer;
    bowSpace.Initialize(docs);
    // compute layout
    SemanticSpaceLayout semSpc = new SemanticSpaceLayout(bowSpace);
    Vector2D[] coords = semSpc.ComputeLayout();
    // build spatial index
    //Utils.VerboseLine("Building spatial index ...");
    //SpatialIndex2D spatIdx = new SpatialIndex2D();
    //spatIdx.BuildIndex(coords);
    //spatIdx.InsertPoint(9000, new Vector2D(1000, 1000));
    //ArrayList<IdxDat<Vector2D>> points = spatIdx.GetPoints(new Vector2D(0.5, 0.5), 0.1);
    //Utils.VerboseLine("Number of retrieved points: {0}.", points.Count);
    ArrayList<Vector2D> tmp = new ArrayList<Vector2D>(coords);
    tmp.Shuffle();
    //tmp.RemoveRange(1000, tmp.Count - 1000);
    // compute elevation
    StreamWriter writer = new StreamWriter("c:\\elev.txt");
    LayoutSettings ls = new LayoutSettings(800, 600);
    ls.AdjustmentType = LayoutAdjustmentType.Soft;
    ls.StdDevMult = 2;
    ls.FitToBounds = true;
    ls.MarginVert = 50;
    ls.MarginHoriz = 50;
    double[,] zMtx = VisualizationUtils.ComputeLayoutElevation(tmp, ls, 150, 200);
    VisualizationUtils.__DrawElevation__(tmp, ls, 300, 400).Save("c:\\elev.bmp");
    for (int row = 0; row < zMtx.GetLength(0); row++)
    {
        for (int col = 0; col < zMtx.GetLength(1); col++)
        {
            writer.Write("{0}\t", zMtx[row, col]);
        }
        writer.WriteLine();
    }
    writer.Close();
    // output coordinates
    StreamWriter tsvWriter = new StreamWriter("c:\\layout.tsv");
    for (int i = 0; i < coords.Length; i++)
    {
        //if (i < points.Count)
        //{
        //    tsvWriter.WriteLine("{0}\t{1}\t{2}\t{3}", coords[i].X, coords[i].Y, points[i].Dat.X, points[i].Dat.Y);
        //}
        //else
        {
            tsvWriter.WriteLine("{0}\t{1}", coords[i].X, coords[i].Y);
        }
    }
    tsvWriter.Close();
    //// get document names
    //int k = 0;
    //ArrayList<Pair<string, Vector2D>> layout = new ArrayList<Pair<string, Vector2D>>();
    //foreach (string doc in docs)
    //{
    //    string[] docInfo = doc.Split(' ');
    //    layout.Add(new Pair<string, Vector2D>(docInfo[0], coords[k++]));
    //}
    //Console.WriteLine(coords.Length);
    //Console.WriteLine(layout.Count);
    //StreamWriter writer = new StreamWriter("c:\\vidCoords.txt");
    //foreach (Pair<string, Vector2D> docPos in layout)
    //{
    //    writer.WriteLine("{0}\t{1}\t{2}", docPos.First, docPos.Second.X, docPos.Second.Y);
    //}
    //writer.Close();
}
private IEnumerable<Token> CreateToken(HtmlNode node, out Token endTag, RegexTokenizer textBlockTokenizer)
{
    IEnumerable<Token> tokens = null;
    endTag = null;
    if (node.NodeType == HtmlNodeType.Element)
    {
        // case 1: open tag like <i> without </i> (inside another tag like <b><i></b>)
        if (node._innerlength <= 0 && node._outerlength <= 0)
        {
            Token token = new Token();
            token.mTokenType = TokenType.OpenTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._innerstartindex - node._outerstartindex;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
        }
        // case 2: open tag like <i> without </i> (other cases)
        else if (node._innerlength <= 0 && node.EndNode == null)
        {
            Token token = new Token();
            token.mTokenType = TokenType.OpenTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._outerlength;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
        }
        // case 3: empty tag like <br> or <br/>
        else if (node._innerlength <= 0)
        {
            if (node.EndNode._outerstartindex != node._outerstartindex) // handle <tag></tag> pair
            {
                string startTagStr = mText.Substring(node._outerstartindex, node.EndNode._outerstartindex - node._outerstartindex);
                Token firstTag = new Token();
                firstTag.mTokenType = TokenType.StartTag;
                firstTag.mStartIndex = node._outerstartindex;
                firstTag.mLength = startTagStr.Length;
                firstTag.mTokenStr = startTagStr;
                firstTag.mTagName = node.Name.ToLower();
                string endTagStr = mText.Substring(node.EndNode._outerstartindex, node.EndNode._outerlength);
                Token secondTag = new Token();
                secondTag.mTokenType = TokenType.EndTag;
                secondTag.mStartIndex = firstTag.mStartIndex + firstTag.mLength;
                secondTag.mLength = endTagStr.Length;
                secondTag.mTokenStr = endTagStr;
                secondTag.mTagName = firstTag.mTagName;
                tokens = new Token[] { firstTag, secondTag };
            }
            else // handle <tag/>
            {
                Token token = new Token();
                token.mTokenType = TokenType.EmptyTag;
                token.mStartIndex = node._outerstartindex;
                token.mLength = node._outerlength;
                token.mTokenStr = mText.Substring(node._outerstartindex, node._outerlength);
                token.mTagName = node.Name.ToLower();
                tokens = new Token[] { token };
            }
        }
        // case 4: closed tag like <b>some text</b>
        else
        {
            Token token = new Token();
            token.mTokenType = TokenType.StartTag;
            token.mStartIndex = node._outerstartindex;
            token.mLength = node._innerstartindex - node._outerstartindex;
            token.mTokenStr = mText.Substring(token.mStartIndex, token.mLength);
            token.mTagName = node.Name.ToLower();
            tokens = new Token[] { token };
            endTag = new Token();
            endTag.mTokenType = TokenType.EndTag;
            endTag.mStartIndex = node._innerstartindex + node._innerlength;
            endTag.mLength = node._outerstartindex + node._outerlength - endTag.mStartIndex;
            endTag.mTokenStr = mText.Substring(endTag.mStartIndex, endTag.mLength);
            endTag.mTagName = token.mTagName;
        }
    }
    else if (node.NodeType == HtmlNodeType.Text)
    {
        if (textBlockTokenizer == null)
        {
            Token token = new Token();
            token.mTokenType = TokenType.Text;
            token.mStartIndex = node._innerstartindex;
            token.mLength = node._innerlength;
            token.mTokenStr = mText.Substring(node._innerstartindex, node._innerlength);
            if (mDecodeTextBlocks)
            {
                token.mTokenStr = HttpUtility.HtmlDecode(token.mTokenStr);
            }
            tokens = new Token[] { token };
        }
        else // tokenize text block
        {
            tokens = new ArrayList<Token>();
            string text = mText.Substring(node._innerstartindex, node._innerlength);
            textBlockTokenizer.Text = mDecodeTextBlocks ? HttpUtility.HtmlDecode(text) : text;
            RegexTokenizer.Enumerator tokEnum = (RegexTokenizer.Enumerator)textBlockTokenizer.GetEnumerator();
            int baseIdx = node._innerstartindex;
            while (tokEnum.MoveNext())
            {
                string tokenStr = tokEnum.Current;
                Token token = new Token();
                token.mTokenType = GetTokenType(tokenStr);
                if (!mDecodeTextBlocks)
                {
                    token.mStartIndex = baseIdx + tokEnum.CurrentTokenIdx;
                    token.mLength = tokenStr.Length;
                }
                token.mTokenStr = tokenStr;
                ((ArrayList<Token>)tokens).Add(token);
            }
            if (((ArrayList<Token>)tokens).Count == 0)
            {
                tokens = null;
            }
        }
    }
    return tokens;
}